From 7510ec796096b1d1e70d48b2f04b53442c5943f2 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Wed, 29 Oct 2014 15:47:24 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12655
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/geryon/VERSION.txt         |   2 +-
 lib/gpu/geryon/nvd_device.h        |   6 ++
 lib/gpu/geryon/ocl_device.h        |  13 ++-
 lib/gpu/geryon/ocl_memory.h        |   5 +-
 lib/gpu/lal_answer.cpp             |  16 ++-
 lib/gpu/lal_aux_fun1.h             |  71 +++++++-------
 lib/gpu/lal_base_ellipsoid.cpp     |   8 +-
 lib/gpu/lal_beck.cu                |  16 +--
 lib/gpu/lal_beck_ext.cpp           |   6 +-
 lib/gpu/lal_born.cu                |  16 +--
 lib/gpu/lal_born_coul_long.cu      |  16 +--
 lib/gpu/lal_born_coul_long_ext.cpp |   6 +-
 lib/gpu/lal_born_coul_wolf.cu      |  44 +++++----
 lib/gpu/lal_born_coul_wolf_ext.cpp |   6 +-
 lib/gpu/lal_born_ext.cpp           |   6 +-
 lib/gpu/lal_buck.cu                |  16 +--
 lib/gpu/lal_buck_coul.cu           |  16 +--
 lib/gpu/lal_buck_coul_ext.cpp      |   6 +-
 lib/gpu/lal_buck_coul_long.cu      |  16 +--
 lib/gpu/lal_buck_coul_long_ext.cpp |   6 +-
 lib/gpu/lal_buck_ext.cpp           |   6 +-
 lib/gpu/lal_cg_cmm.cu              |  16 +--
 lib/gpu/lal_cg_cmm_ext.cpp         |   6 +-
 lib/gpu/lal_cg_cmm_long.cu         |  16 +--
 lib/gpu/lal_cg_cmm_long_ext.cpp    |   6 +-
 lib/gpu/lal_charmm_long.cpp        |  11 ++-
 lib/gpu/lal_charmm_long.cu         |  28 ++----
 lib/gpu/lal_charmm_long_ext.cpp    |   6 +-
 lib/gpu/lal_colloid.cu             |  16 +--
 lib/gpu/lal_colloid_ext.cpp        |   6 +-
 lib/gpu/lal_coul.cu                |  23 +++--
 lib/gpu/lal_coul_debye.cu          |  52 +++++-----
 lib/gpu/lal_coul_dsf.cu            |  16 +--
 lib/gpu/lal_coul_dsf_ext.cpp       |   6 +-
 lib/gpu/lal_coul_long.cu           |  54 +++++-----
 lib/gpu/lal_coul_long_ext.cpp      |   6 +-
 lib/gpu/lal_device.cpp             |  23 +++--
 lib/gpu/lal_device.h               |  10 +-
 lib/gpu/lal_dipole_lj.cu           |  44 ++++-----
 lib/gpu/lal_dipole_lj_ext.cpp      |   6 +-
 lib/gpu/lal_dipole_lj_sf.cu        |  44 ++++-----
 lib/gpu/lal_dipole_lj_sf_ext.cpp   |   6 +-
 lib/gpu/lal_dpd.cu                 |  18 ++--
 lib/gpu/lal_eam.cu                 |  51 +++++-----
 lib/gpu/lal_eam_ext.cpp            |   6 +-
 lib/gpu/lal_ellipsoid_extra.h      |  15 ++-
 lib/gpu/lal_ellipsoid_nbor.cu      |  34 +++----
 lib/gpu/lal_gauss.cu               |  16 +--
 lib/gpu/lal_gauss_ext.cpp          |   6 +-
 lib/gpu/lal_gayberne.cu            |   4 +-
 lib/gpu/lal_gayberne_ext.cpp       |   6 +-
 lib/gpu/lal_gayberne_lj.cu         |  20 ++--
 lib/gpu/lal_lj.cu                  |  26 ++---
 lib/gpu/lal_lj96.cu                |  16 +--
 lib/gpu/lal_lj96_ext.cpp           |   6 +-
 lib/gpu/lal_lj_class2_long.cu      |  16 +--
 lib/gpu/lal_lj_class2_long_ext.cpp |   6 +-
 lib/gpu/lal_lj_coul.cu             |  16 +--
 lib/gpu/lal_lj_coul_debye.cu       |  16 +--
 lib/gpu/lal_lj_coul_debye_ext.cpp  |   6 +-
 lib/gpu/lal_lj_coul_ext.cpp        |   6 +-
 lib/gpu/lal_lj_coul_long.cpp       |  19 +++-
 lib/gpu/lal_lj_coul_long.cu        |  16 +--
 lib/gpu/lal_lj_coul_long.h         |   5 +
 lib/gpu/lal_lj_coul_long_ext.cpp   |  29 +++++-
 lib/gpu/lal_lj_coul_msm.cu         |  16 +--
 lib/gpu/lal_lj_coul_msm_ext.cpp    |   6 +-
 lib/gpu/lal_lj_dsf.cu              |  16 +--
 lib/gpu/lal_lj_dsf_ext.cpp         |   6 +-
 lib/gpu/lal_lj_expand.cu           |  16 +--
 lib/gpu/lal_lj_expand_ext.cpp      |   6 +-
 lib/gpu/lal_lj_ext.cpp             |   6 +-
 lib/gpu/lal_lj_gromacs.cu          |  18 ++--
 lib/gpu/lal_mie.cu                 |  16 +--
 lib/gpu/lal_mie_ext.cpp            |   6 +-
 lib/gpu/lal_morse.cu               |  16 +--
 lib/gpu/lal_morse_ext.cpp          |   6 +-
 lib/gpu/lal_neighbor.cpp           |   7 +-
 lib/gpu/lal_neighbor.h             |   1 +
 lib/gpu/lal_neighbor_cpu.cu        |  10 +-
 lib/gpu/lal_pppm.cpp               |   2 +-
 lib/gpu/lal_pppm_ext.cpp           |   6 +-
 lib/gpu/lal_preprocessor.h         |  50 ++++++++++
 lib/gpu/lal_re_squared.cu          |   4 +-
 lib/gpu/lal_re_squared_ext.cpp     |   6 +-
 lib/gpu/lal_re_squared_lj.cu       |  24 ++---
 lib/gpu/lal_soft.cu                |  16 +--
 lib/gpu/lal_soft_ext.cpp           |   6 +-
 lib/gpu/lal_sw.cu                  | 152 ++++++++++++++++-------------
 lib/gpu/lal_table.cu               |  64 ++++++------
 lib/gpu/lal_table_ext.cpp          |   6 +-
 lib/gpu/lal_yukawa.cu              |  16 +--
 lib/gpu/lal_yukawa_colloid.cu      |  16 +--
 lib/gpu/lal_yukawa_colloid_ext.cpp |   6 +-
 lib/gpu/lal_yukawa_ext.cpp         |   6 +-
 95 files changed, 850 insertions(+), 745 deletions(-)

diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt
index 733136e842..39b132ac54 100644
--- a/lib/gpu/geryon/VERSION.txt
+++ b/lib/gpu/geryon/VERSION.txt
@@ -1 +1 @@
-Geryon Version 13.209
+Geryon Version 13.234
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 5fffe77c82..12a18ae873 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -159,6 +159,12 @@ class UCL_Device {
   /// Returns true if double precision is support for the device
   inline bool double_precision(const int i) {return arch(i)>=1.3;}
   
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i) 
+    { return _properties[i].multiProcessorCount; }
+
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 79fa53d552..8dadcf2efd 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -185,13 +185,12 @@ class UCL_Device {
   inline bool double_precision(const int i) 
     {return _properties[i].double_precision;}
    
-  /// Get the number of cores in the current device
-  inline unsigned cores() { return cores(_device); }
-  /// Get the number of cores
-  inline unsigned cores(const int i) 
-    { if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
-      else return _properties[i].compute_units*8; }
-  
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i) 
+    { return _properties[i].compute_units; }
+
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
   /// Get the gigabytes of global memory
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h
index 09089a52ce..7aed0a1a8c 100644
--- a/lib/gpu/geryon/ocl_memory.h
+++ b/lib/gpu/geryon/ocl_memory.h
@@ -127,7 +127,8 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
   orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
   
   mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
-                              mat.host_ptr(), &error_flag);
+                              *mat.host_ptr(), &error_flag);
+
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
@@ -174,7 +175,7 @@ template <class mat_type>
 inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
   cl_int error_flag;
   mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                              n,mat.host_ptr(),&error_flag);
+                              n,*mat.host_ptr(),&error_flag);
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index ddf893e4ed..c1f4999207 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -221,7 +221,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
-  int vstart=0, iend=_inum;
+  int ii, vstart=0, iend=_inum;
   if (_eflag) {
     iend=_inum*2;
     for (int i=0; i<_inum; i++)
@@ -235,10 +235,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
         for (int i=_inum; i<iend; i++)
           eatom[i]+=engv[i];
       } else {
-        for (int i=0; i<_inum; i++)
-          eatom[_ilist[i]]+=engv[i];
-        for (int i=_inum; i<iend; i++)
-          eatom[_ilist[i]]+=engv[i];
+        for (int i=0, ii=0; i<_inum; i++)
+          eatom[_ilist[ii++]]+=engv[i];
+        for (int i=_inum, ii=0; i<iend; i++)
+          eatom[_ilist[ii++]]+=engv[i];
       }
     vstart=iend;
     iend+=_inum;
@@ -249,12 +249,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
         virial[j]+=engv[i];
       if (_vf_atom)
         if (_ilist==NULL) {
-          int ii=0;
           for (int i=vstart; i<iend; i++)
-            vatom[ii++][j]+=engv[i];
+            vatom[i++][j]+=engv[i];
         } else {
-          int ii=0;
-          for (int i=vstart; i<iend; i++)
+          for (int i=vstart, ii=0; i<iend; i++)
             vatom[_ilist[ii++]][j]+=engv[i];
         }  
       vstart+=_inum;
diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h
index 2c57e1f6a5..b40bb7f943 100644
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@@ -23,22 +23,21 @@
   ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
 
 #define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
-                  i, numj, stride, list_end, nbor)                           \
-  nbor=nbor_mem+ii;                                                          \
-  i=*nbor;                                                                   \
-  nbor+=nbor_stride;                                                         \
-  numj=*nbor;                                                                \
+                  i, numj, stride, nbor_end, nbor_begin)                     \
+  i=nbor_mem[ii];                                                            \
+  nbor_begin=ii+nbor_stride;                                                 \
+  numj=nbor_mem[nbor_begin];                                                 \
   if (nbor_mem==packed_mem) {                                                \
-    nbor+=nbor_stride+fast_mul(ii,t_per_atom-1);                             \
+    nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1);                       \
     stride=fast_mul(t_per_atom,nbor_stride);                                 \
-    list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \
-    nbor+=offset;                                                            \
+    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
+    nbor_begin+=offset;                                                      \
   } else {                                                                   \
-    nbor+=nbor_stride;                                                       \
-    nbor=packed_mem+*nbor;                                                   \
-    list_end=nbor+numj;                                                      \
+    nbor_begin+=nbor_stride;                                                 \
+    nbor_begin=nbor_mem[nbor_begin];                                         \
+    nbor_end=nbor_begin+numj;                                                \
     stride=t_per_atom;                                                       \
-    nbor+=offset;                                                            \
+    nbor_begin+=offset;                                                      \
   }
 
 #if (ARCH < 300)
@@ -75,15 +74,15 @@
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -123,17 +122,17 @@
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+      engv[ei]=e_coul*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -158,15 +157,15 @@
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -190,17 +189,17 @@
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+      engv[ei]=e_coul*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index dd83bfa9a4..4200c02e1c 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -202,6 +202,7 @@ void BaseEllipsoidT::output_times() {
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
   double max_mb=mpi_max_bytes/(1024*1024);
+  double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
 
   if (device->replica_me()==0)
     if (screen && times[5]>0.0) {
@@ -209,11 +210,11 @@ void BaseEllipsoidT::output_times() {
 
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      GPU Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (device->procs_per_gpu()==1) {
+      if (device->procs_per_gpu()==1 && t_time>0) {
         fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/replica_size);
         fprintf(screen,"Data Cast/Pack:  %.4f s.\n",times[5]/replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/replica_size);
@@ -226,7 +227,8 @@ void BaseEllipsoidT::output_times() {
       }
       if (nbor->gpu_nbor()==2)
         fprintf(screen,"Neighbor (CPU):  %.4f s.\n",times[9]/replica_size);
-      fprintf(screen,"GPU Overhead:    %.4f s.\n",times[6]/replica_size);
+      if (times[6]>0)
+        fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Threads / atom:  %d.\n",_threads_per_atom);      
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu
index 479587047c..7ccefd8859 100644
--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@@ -52,19 +52,19 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -154,20 +154,20 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp
index 40afe9ca27..28ca0df346 100644
--- a/lib/gpu/lal_beck_ext.cpp
+++ b/lib/gpu/lal_beck_ext.cpp
@@ -48,7 +48,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu
index b4b3a64d22..5f917be846 100644
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@@ -53,19 +53,19 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -148,20 +148,20 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu
index d8198d6832..3d74f2087a 100644
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@@ -66,18 +66,18 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -188,19 +188,19 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp
index 9734fe9f2b..382e9a2b2c 100644
--- a/lib/gpu/lal_born_coul_long_ext.cpp
+++ b/lib/gpu/lal_born_coul_long_ext.cpp
@@ -52,7 +52,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -71,9 +71,9 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu
index 9727f7a017..e7706b408a 100644
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@@ -69,11 +69,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -85,8 +85,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -105,7 +105,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq_sigma[mtype].x) { // cutsq
         numtyp r2inv = ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
         
@@ -124,10 +124,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -137,8 +138,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
@@ -201,11 +205,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -218,8 +222,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -237,7 +241,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
 
       if (rsq<cutsq_sigma[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
         
@@ -256,10 +260,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -269,8 +274,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp
index 5c9e2c02bf..b56c526119 100644
--- a/lib/gpu/lal_born_coul_wolf_ext.cpp
+++ b/lib/gpu/lal_born_coul_wolf_ext.cpp
@@ -52,7 +52,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp
index 4dcc899623..6bd51e6d68 100644
--- a/lib/gpu/lal_born_ext.cpp
+++ b/lib/gpu/lal_born_ext.cpp
@@ -50,7 +50,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu
index a592c71923..955547e598 100644
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@@ -52,19 +52,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -145,20 +145,20 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu
index 4a70efe197..87604a02ea 100644
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@@ -65,18 +65,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
     
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -182,19 +182,19 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp
index bbf076199b..dd696fc6bb 100644
--- a/lib/gpu/lal_buck_coul_ext.cpp
+++ b/lib/gpu/lal_buck_coul_ext.cpp
@@ -51,7 +51,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu
index e10943fd1b..fc68d12471 100644
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@@ -66,18 +66,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -190,19 +190,19 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp
index 80382ff5b3..9c0c331ee1 100644
--- a/lib/gpu/lal_buck_coul_long_ext.cpp
+++ b/lib/gpu/lal_buck_coul_long_ext.cpp
@@ -52,7 +52,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp
index b118a5b0eb..75c88e8dbe 100644
--- a/lib/gpu/lal_buck_ext.cpp
+++ b/lib/gpu/lal_buck_ext.cpp
@@ -49,7 +49,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu
index 0295f59c30..8f89f74d22 100644
--- a/lib/gpu/lal_cg_cmm.cu
+++ b/lib/gpu/lal_cg_cmm.cu
@@ -52,19 +52,19 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -150,20 +150,20 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp
index a108f7525d..0d2c3d8fbf 100644
--- a/lib/gpu/lal_cg_cmm_ext.cpp
+++ b/lib/gpu/lal_cg_cmm_ext.cpp
@@ -49,7 +49,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu
index 1ff506a34d..ae8b6cda47 100644
--- a/lib/gpu/lal_cg_cmm_long.cu
+++ b/lib/gpu/lal_cg_cmm_long.cu
@@ -65,18 +65,18 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -191,19 +191,19 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp
index f4c1fd40d1..966588bf9b 100644
--- a/lib/gpu/lal_cg_cmm_long_ext.cpp
+++ b/lib/gpu/lal_cg_cmm_long_ext.cpp
@@ -51,7 +51,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp
index b2fafc46ff..157072dc22 100644
--- a/lib/gpu/lal_charmm_long.cpp
+++ b/lib/gpu/lal_charmm_long.cpp
@@ -66,13 +66,14 @@ int CHARMMLongT::init(const int ntypes,
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (this->_block_bio_size>=64 && mix_arithmetic)
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (this->_block_bio_size>=64 && mix_arithmetic &&
+      lj_types<=max_bio_shared_types)
     shared_types=true;
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   int h_size=lj_types*lj_types;
-  int max_bio_shared_types=this->device->max_bio_shared_types();
   if (h_size<max_bio_shared_types)
     h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
@@ -84,8 +85,10 @@ int CHARMMLongT::init(const int ntypes,
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_lj3,host_lj4);
 
-  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  if (shared_types) {
+    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
+    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  }
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu
index 3e473a031a..dde50da300 100644
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@@ -32,7 +32,7 @@ texture<int2> q_tex;
 __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const int lj_types, 
-                            const __global numtyp *restrict sp_lj_in,
+                            const __global numtyp *restrict sp_lj,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
@@ -47,16 +47,6 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp sp_lj[8];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-  sp_lj[4]=sp_lj_in[4];
-  sp_lj[5]=sp_lj_in[5];
-  sp_lj[6]=sp_lj_in[6];
-  sp_lj[7]=sp_lj_in[7];
-
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -66,18 +56,18 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -196,18 +186,18 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp
index e9ffc16939..807988a3e8 100644
--- a/lib/gpu/lal_charmm_long_ext.cpp
+++ b/lib/gpu/lal_charmm_long_ext.cpp
@@ -53,7 +53,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu
index a8f0cf63ca..a4d6c8bf33 100644
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@@ -55,19 +55,19 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -219,20 +219,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp
index 037a982cf4..ea83cb6417 100644
--- a/lib/gpu/lal_colloid_ext.cpp
+++ b/lib/gpu/lal_colloid_ext.cpp
@@ -51,7 +51,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu
index 7f8b9af6a0..e955922a7c 100644
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@@ -60,20 +60,19 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      numtyp factor_coul;
+      int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -152,21 +151,21 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      numtyp factor_coul = sp_cl[sbmask(j)];
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu
index a1c5da34ba..0e4c0ea2d0 100644
--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@@ -30,19 +30,19 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
-                         const __global numtyp *restrict scale,
-                         const int lj_types, 
-                         const __global numtyp *restrict sp_cl_in,
-                         const __global int *dev_nbor, 
-                         const __global int *dev_packed, 
-                         __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv,
-                         const int eflag, const int vflag, const int inum,
-                         const int nbor_pitch,
-                         const __global numtyp *restrict q_ ,
-                         const __global numtyp *restrict cutsq, 
-                         const numtyp qqrd2e, const numtyp kappa,
-                         const int t_per_atom) {
+                           const __global numtyp *restrict scale,
+                           const int lj_types, 
+                           const __global numtyp *restrict sp_cl_in,
+                           const __global int *dev_nbor, 
+                           const __global int *dev_packed, 
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv,
+                           const int eflag, const int vflag, const int inum,
+                           const int nbor_pitch,
+                           const __global numtyp *restrict q_ ,
+                           const __global numtyp *restrict cutsq, 
+                           const numtyp qqrd2e, const numtyp kappa,
+                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,20 +61,20 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+  
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -158,21 +158,21 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      numtyp factor_coul = sp_cl[sbmask(j)];
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu
index 5a8b82bcb6..56b3a531bc 100644
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@@ -62,11 +62,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -77,8 +77,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
@@ -163,11 +163,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -178,8 +178,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
  
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp
index 4f2b7131d0..e65a090a16 100644
--- a/lib/gpu/lal_coul_dsf_ext.cpp
+++ b/lib/gpu/lal_coul_dsf_ext.cpp
@@ -49,7 +49,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu
index fcc69c44ab..48a9681766 100644
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -124,7 +124,8 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_long(const __global numtyp4 *restrict x_, 
-                          const __global numtyp *restrict scale,
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
                           const int lj_types,
                           const __global numtyp *restrict sp_cl_in, 
                           const __global int *dev_nbor,
@@ -153,44 +154,41 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int itype=ix.w;
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype=jx.w;
-      
+
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
-      
-      int mtype=itype*lj_types+jtype;
+
       if (rsq < cut_coulsq) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
-        
+
         numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -198,7 +196,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-	        e_coul += prefactor*(_erfc-factor_coul);
+	  e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -217,7 +215,8 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
 }
 
 __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, 
-                               const __global numtyp *restrict scale_in,
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor, 
                                const __global int *dev_packed,
@@ -231,14 +230,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_cl[4];
   if (tid<4)
     sp_cl[tid]=sp_cl_in[tid];
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    scale[tid]=scale_in[tid];
-  }
-  
+
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -249,27 +244,24 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
-    int iw = ix.w;
-    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
-    
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int mtype=itype+jx.w;
-      
+
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
@@ -280,13 +272,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
-        numtyp r = ucl_sqrt(rsq);
+        numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -294,7 +286,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-	        e_coul += prefactor*(_erfc-factor_coul);
+	  e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp
index 49d593d9a0..5552dc2437 100644
--- a/lib/gpu/lal_coul_long_ext.cpp
+++ b/lib/gpu/lal_coul_long_ext.cpp
@@ -48,7 +48,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int cl_gpu_init(const int ntypes, double **host_scale,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 76000bd07b..2ef2ff4d59 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -48,7 +48,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                          const int last_gpu, const int gpu_mode, 
                          const double p_split, const int nthreads, 
                          const int t_per_atom, const double cell_size,
-                         char *ocl_vendor) {
+                         char *ocl_vendor, const int block_pair) {
   _nthreads=nthreads;
   #ifdef _OPENMP
   omp_set_num_threads(nthreads);
@@ -66,6 +66,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
   _cell_size=cell_size;
+  _block_pair=block_pair;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
@@ -175,6 +176,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
   } else if (s_vendor=="cypress") {    
     _ocl_vendor_name="AMD Cypress";
     _ocl_vendor_string="-DCYPRESS_OCL";
+  } else if (s_vendor=="phi") {    
+    _ocl_vendor_name="Intel Phi";
+    _ocl_vendor_string="-DPHI_OCL";
+  } else if (s_vendor=="intel") {    
+    _ocl_vendor_name="Intel CPU";
+    _ocl_vendor_string="-DINTEL_OCL";
   } else if (s_vendor=="generic") {    
     _ocl_vendor_name="GENERIC";
     _ocl_vendor_string="-DGENERIC_OCL";
@@ -352,10 +359,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
     for (int i=first_gpu; i<last; i++) {
       std::string sname;
       if (i==first_gpu)
-        sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+        sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
               toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ (";
       else              
-        sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+        sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
               toa(gpu->clock_rate(i))+" GHZ (";
       if (sizeof(PRECISION)==4) {
         if (sizeof(ACC_PRECISION)==4)
@@ -520,7 +527,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
     if (screen && times[5]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"    Device Time Info (average): ");
+      fprintf(screen,"      Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
@@ -582,7 +589,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
     if (screen && times[6]>0.0) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"      Device Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
@@ -672,7 +679,7 @@ int DeviceT::compile_kernels() {
     _threads_per_charge=gpu_lib_data[13];
   _pppm_max_spline=gpu_lib_data[4];
   _pppm_block=gpu_lib_data[5];
-  _block_pair=gpu_lib_data[6];
+  if (_block_pair == -1) _block_pair=gpu_lib_data[6];
   _max_shared_types=gpu_lib_data[7];
   _block_cell_2d=gpu_lib_data[8];
   _block_cell_id=gpu_lib_data[9];
@@ -714,10 +721,10 @@ int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                     const int last_gpu, const int gpu_mode, 
                     const double particle_split, const int nthreads,
                     const int t_per_atom, const double cell_size, 
-                    char *opencl_vendor) {
+                    char *opencl_vendor, const int block_pair) {
   return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
                                    particle_split,nthreads,t_per_atom, 
-                                   cell_size,opencl_vendor);
+                                   cell_size,opencl_vendor,block_pair);
 }
 
 void lmp_clear_device() {
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 9a767e74b3..77321f5462 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -51,7 +51,7 @@ class Device {
                   const int last_gpu, const int gpu_mode, 
                   const double particle_split, const int nthreads,
                   const int t_per_atom, const double cell_size, 
-                  char *vendor_string);
+                  char *vendor_string, const int block_pair);
 
   /// Initialize the device for Atom and Neighbor storage
   /** \param rot True if quaternions need to be stored
@@ -96,12 +96,12 @@ class Device {
                     const int first_gpu, const int last_gpu);
 
   /// Perform charge assignment asynchronously for PPPM
-	void set_single_precompute(PPPM<numtyp,acctyp,
-	                                         float,_lgpu_float4> *pppm);
+  void set_single_precompute(PPPM<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);
 
   /// Perform charge assignment asynchronously for PPPM
-	void set_double_precompute(PPPM<numtyp,acctyp,
-	                                         double,_lgpu_double4> *pppm);
+  void set_double_precompute(PPPM<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);
 
   /// Esimate the overhead from GPU calls from multiple procs
   /** \param kernel_calls Number of kernel calls/timestep for timing estimated
diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu
index e5ca559dab..b6483d1ef8 100644
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@@ -73,17 +73,17 @@ texture<int4,1> mu_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
+      engv[ei]=e_coul*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                        \
+        ei+=inum;                                                         \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -113,17 +113,17 @@ texture<int4,1> mu_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
+      engv[ei]=e_coul*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                        \
+        ei+=inum;                                                         \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -173,19 +173,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -385,11 +385,11 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -397,8 +397,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp
index d3174c3392..55bbe0b804 100644
--- a/lib/gpu/lal_dipole_lj_ext.cpp
+++ b/lib/gpu/lal_dipole_lj_ext.cpp
@@ -50,7 +50,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu
index e761b115f3..8469ed9ac9 100644
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@@ -74,17 +74,17 @@ texture<int4,1> mu_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+      engv[ei]=e_coul*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -114,17 +114,17 @@ texture<int4,1> mu_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv=energy*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
-      *engv=e_coul*(acctyp)0.5;                                             \
-      engv+=inum;                                                           \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+      engv[ei]=e_coul*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -174,19 +174,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -418,11 +418,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -430,8 +430,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp
index 8a24233649..8abf78c903 100644
--- a/lib/gpu/lal_dipole_lj_sf_ext.cpp
+++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp
@@ -50,7 +50,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu
index b3fc5ef6e1..209bc0233e 100644
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@@ -187,11 +187,10 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
@@ -199,9 +198,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
     int itag=iv.w;
 
     numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_dpd = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -308,11 +307,10 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
@@ -321,9 +319,9 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
     int itag=iv.w;
 
     numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_dpd = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index 70301c7c5a..054b3ca6db 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -115,14 +115,15 @@ texture<int4> z2r_sp2_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      engv[ii]+=energy*(acctyp)0.5;                                         \
-      engv+=inum;                                                           \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        engv[ii]=virial[i]*(acctyp)0.5;                                     \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -171,15 +172,15 @@ texture<int4> z2r_sp2_tex;
     }                                                                       \
   }                                                                         \
   if (offset==0) {                                                          \
-    engv+=ii;                                                               \
+    int ei=ii;                                                              \
     if (eflag>0) {                                                          \
-      *engv+=energy*(acctyp)0.5;                                            \
-      engv+=inum;                                                           \
+      engv[ei]+=energy*(acctyp)0.5;                                         \
+      ei+=inum;                                                             \
     }                                                                       \
     if (vflag>0) {                                                          \
       for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i]*(acctyp)0.5;                                        \
-        engv+=inum;                                                         \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
       }                                                                     \
     }                                                                       \
     ans[ii]=f;                                                              \
@@ -209,17 +210,17 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
   acctyp energy = (acctyp)0;
    
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -286,17 +287,17 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
   __syncthreads(); 
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -354,18 +355,18 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp ifp; fetch(ifp,i,fp_tex);  //fp_[i];
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -468,19 +469,19 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp
index 57695555b1..d56f750e2f 100644
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@@ -58,7 +58,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -77,9 +77,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h
index f4697b157f..b33f087212 100644
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@@ -31,14 +31,13 @@ texture<int4,1> pos_tex, quat_tex;
 #endif
 
 #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
-                    i, numj, stride, list_end, nbor)                         \
-    nbor=nbor_mem+ii;                                                        \
-    i=*nbor;                                                                 \
-    nbor+=nbor_stride;                                                       \
-    numj=*nbor;                                                              \
-    nbor+=nbor_stride;                                                       \
-    list_end=nbor+fast_mul(nbor_stride,numj);                                \
-    nbor+=fast_mul(offset,nbor_stride);                                      \
+                    i, numj, stride, nbor_end, nbor_begin)                   \
+    i=nbor_mem[ii];                                                          \
+    nbor_begin=ii+nbor_stride;                                               \
+    numj=nbor_mem[nbor_begin];                                               \
+    nbor_begin+=nbor_stride;                                                 \
+    nbor_end=nbor_begin+fast_mul(nbor_stride,numj);                          \
+    nbor_begin+=fast_mul(offset,nbor_stride);                                \
     stride=fast_mul(t_per_atom,nbor_stride);
 
 #if (ARCH < 300)
diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu
index 47ee173a4b..30d864aecc 100644
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@@ -41,20 +41,19 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
   int ii=GLOBAL_ID_X+start;
 
   if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
     nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul(iw,ntypes);
     int newj=0;  
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
       j &= NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
@@ -70,7 +69,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
         rsq+=t*t;
 
         if (rsq<cf.x) {
-          *packed=j;
+          dev_nbor[packed]=j;
           packed+=nbor_pitch;
           newj++;
         }
@@ -105,21 +104,20 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
     nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     int newj=0;  
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
       j &= NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
@@ -135,7 +133,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
         rsq+=t*t;
 
         if (rsq<cutsq[mtype]) {
-          *packed=j;
+          dev_nbor[packed]=j;
           packed+=nbor_pitch;
           newj++;
         }
diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu
index 3c2fe554df..6accf36a06 100644
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@@ -51,19 +51,19 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -138,20 +138,20 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp
index fdef451366..7c15a12591 100644
--- a/lib/gpu/lal_gauss_ext.cpp
+++ b/lib/gpu/lal_gauss_ext.cpp
@@ -48,7 +48,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu
index 74072fc673..1a7e69eeba 100644
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@@ -119,7 +119,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@@ -140,7 +140,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp
index 69ccb23b96..e674fb376b 100644
--- a/lib/gpu/lal_gayberne_ext.cpp
+++ b/lib/gpu/lal_gayberne_ext.cpp
@@ -52,7 +52,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -71,9 +71,9 @@ int gb_gpu_init(const int ntypes, const double gamma,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu
index c73ffa2e03..9b33b5f7f3 100644
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@@ -68,7 +68,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -276,19 +276,19 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -371,20 +371,20 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu
index 11b3a31dad..9569cb0fd7 100644
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@@ -28,7 +28,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                    const __global numtyp4 *restrict lj1,
                    const __global numtyp4 *restrict lj3, 
                    const int lj_types, 
-                   const __global numtyp *restrict sp_lj_in, 
+                   const __global numtyp *restrict sp_lj, 
                    const __global int * dev_nbor, 
                    const __global int * dev_packed, 
                    __global acctyp4 *restrict ans, 
@@ -38,12 +38,6 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp sp_lj[4];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -52,19 +46,18 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -142,20 +135,19 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu
index e274934618..b219b8bf0d 100644
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@@ -52,19 +52,19 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -143,20 +143,20 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp
index 11b1ed184f..14c32ef95e 100644
--- a/lib/gpu/lal_lj96_ext.cpp
+++ b/lib/gpu/lal_lj96_ext.cpp
@@ -48,7 +48,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index 2742a1f7b6..e16de3a327 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -65,18 +65,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -187,19 +187,19 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp
index 5134d149ff..4bb3aad7ad 100644
--- a/lib/gpu/lal_lj_class2_long_ext.cpp
+++ b/lib/gpu/lal_lj_class2_long_ext.cpp
@@ -51,7 +51,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index cb225393fc..364203db22 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -65,18 +65,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -178,19 +178,19 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu
index f5825d5f3d..308504c6c8 100644
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@@ -66,18 +66,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -186,19 +186,19 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp
index 572f2bba13..67f5a0075f 100644
--- a/lib/gpu/lal_lj_coul_debye_ext.cpp
+++ b/lib/gpu/lal_lj_coul_debye_ext.cpp
@@ -51,7 +51,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp
index 1c0499e578..3b5cc09805 100644
--- a/lib/gpu/lal_lj_coul_ext.cpp
+++ b/lib/gpu/lal_lj_coul_ext.cpp
@@ -50,7 +50,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp
index 97ccfc4272..03f32a5fd0 100644
--- a/lib/gpu/lal_lj_coul_long.cpp
+++ b/lib/gpu/lal_lj_coul_long.cpp
@@ -80,7 +80,7 @@ int LJCoulLongT::init(const int ntypes,
 
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq, host_cut_ljsq);
+	   host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
@@ -102,6 +102,23 @@ int LJCoulLongT::init(const int ntypes,
   return 0;
 }
 
+template <class numtyp, class acctyp>
+void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                         double **host_lj2, double **host_lj3, double **host_lj4,
+                         double **host_offset, double **host_cut_ljsq) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+  
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_cut_ljsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
+
 template <class numtyp, class acctyp>
 void LJCoulLongT::clear() {
   if (!_allocated)
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index 7d838b7c17..e0aa2e8a58 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -65,18 +65,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -183,19 +183,19 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h
index 4085f8eaef..2708cab05b 100644
--- a/lib/gpu/lal_lj_coul_long.h
+++ b/lib/gpu/lal_lj_coul_long.h
@@ -46,6 +46,11 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
 
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_cut_ljsq);
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp
index b769446d43..dc93365f22 100644
--- a/lib/gpu/lal_lj_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_coul_long_ext.cpp
@@ -51,7 +51,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -93,6 +93,29 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   return init_ok;
 }
 
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3, double **host_lj4,
+                    double **offset, double **host_cut_ljsq) {
+  int world_me=LJCLMF.device->world_me();
+  int gpu_rank=LJCLMF.device->gpu_rank();
+  int procs_per_gpu=LJCLMF.device->procs_per_gpu();
+  
+  if (world_me==0)
+    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, 
+                  offset, host_cut_ljsq);
+  LJCLMF.device->world_barrier();
+  
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, 
+                    offset, host_cut_ljsq);
+    LJCLMF.device->gpu_barrier();
+  }
+}
+
 void ljcl_gpu_clear() {
   LJCLMF.clear();
 }
diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu
index 5be8fe98f0..0c7c3cdace 100644
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@@ -118,11 +118,11 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -130,8 +130,8 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
     
     numtyp cut_coul = ucl_sqrt(cut_coulsq);
     
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -239,11 +239,11 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -252,8 +252,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
 
     numtyp cut_coul = ucl_sqrt(cut_coulsq);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp
index ce10cac048..ecf3254cf9 100644
--- a/lib/gpu/lal_lj_coul_msm_ext.cpp
+++ b/lib/gpu/lal_lj_coul_msm_ext.cpp
@@ -51,7 +51,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu
index e8f0f8e361..baec741e29 100644
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@@ -68,11 +68,11 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -84,8 +84,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul, r, prefactor, erfcc;
       factor_lj = sp_lj[sbmask(j)];
@@ -195,11 +195,11 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -212,8 +212,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul, r, prefactor, erfcc;
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp
index bd13d970fa..719a792d7f 100644
--- a/lib/gpu/lal_lj_dsf_ext.cpp
+++ b/lib/gpu/lal_lj_dsf_ext.cpp
@@ -52,7 +52,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -71,9 +71,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu
index 84f1007c58..6b79db2323 100644
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@@ -54,19 +54,19 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -147,20 +147,20 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp
index c1cc99fd12..5303149d1f 100644
--- a/lib/gpu/lal_lj_expand_ext.cpp
+++ b/lib/gpu/lal_lj_expand_ext.cpp
@@ -49,7 +49,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp
index 674c1984bc..345ed4d955 100644
--- a/lib/gpu/lal_lj_ext.cpp
+++ b/lib/gpu/lal_lj_ext.cpp
@@ -48,7 +48,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu
index b1e1d135c4..f20d8634a5 100644
--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@@ -55,19 +55,18 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -161,20 +160,19 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu
index 8fd94bb768..4d718897eb 100644
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@@ -52,19 +52,19 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -143,20 +143,20 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp
index f9b36b5075..d7c4187a42 100644
--- a/lib/gpu/lal_mie_ext.cpp
+++ b/lib/gpu/lal_mie_ext.cpp
@@ -50,7 +50,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index f1570dc1f7..2015c71cb2 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -54,19 +54,19 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -145,20 +145,20 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp
index 2fcf25cf23..3994473fd3 100644
--- a/lib/gpu/lal_morse_ext.cpp
+++ b/lib/gpu/lal_morse_ext.cpp
@@ -49,7 +49,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 877dbf4886..074eaa842b 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -65,6 +65,11 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   else
     _alloc_packed=false;
 
+  if (pre_cut)
+    _packed_permissions=UCL_READ_WRITE;
+  else
+    _packed_permissions=UCL_READ_ONLY;
+
   bool success=true;
     
   // Initialize timers for the selected GPU
@@ -121,7 +126,7 @@ void Neighbor::alloc(bool &success) {
   if (_alloc_packed) {
     dev_packed.clear();
     success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
-                                         UCL_READ_ONLY)==UCL_SUCCESS);
+                                         _packed_permissions)==UCL_SUCCESS);
     _c_bytes+=dev_packed.row_bytes();                                         
   } 
   if (_max_host>0) {
diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h
index d54aa439b0..7653291bbb 100644
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@@ -227,6 +227,7 @@ class Neighbor {
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_host, _alloc_packed;
   double _cutoff, _cell_size, _bin_time;
+  enum UCL_MEMOPT _packed_permissions;
 
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu
index 1a1d392032..384b88d9de 100644
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@@ -25,17 +25,17 @@ __kernel void kernel_unpack(__global int *dev_nbor,
   int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
 
   if (ii<inum) {
-    __global int *nbor=dev_nbor+ii+inum;
-    int numj=*nbor;
+    int nbor=ii+inum;
+    int numj=dev_nbor[nbor];
     nbor+=inum;
-    const __global int *list=dev_ij+*nbor;
-    const __global int *list_end=list+numj;
+    int list=dev_nbor[nbor];
+    int list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
     int stride=fast_mul(t_per_atom,inum);
       
     for ( ; list<list_end; list++) {
-      *nbor=*list;
+      dev_nbor[nbor]=dev_ij[list];
       nbor+=stride;
     }
   } // if ii
diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp
index 188d23096c..3685cf9a89 100644
--- a/lib/gpu/lal_pppm.cpp
+++ b/lib/gpu/lal_pppm.cpp
@@ -160,7 +160,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
 
   // Allocate error flags for checking out of bounds atoms
   success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
+                                       UCL_READ_WRITE)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
     return 0;
diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp
index 08f2c94e90..6e5a82af5b 100644
--- a/lib/gpu/lal_pppm_ext.cpp
+++ b/lib/gpu/lal_pppm_ext.cpp
@@ -51,7 +51,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index e328108600..9dbb3c5944 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -333,6 +333,54 @@ inline double shfl_xor(double var, int laneMask, int width) {
 
 #endif
 
+// -------------------------------------------------------------------------
+//                           INTEL CPU OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef INTEL_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 1
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 4
+#define BLOCK_BIO_PAIR 2
+#define BLOCK_ELLIPSE 2
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 1
+#define BLOCK_CELL_ID 2
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
+// -------------------------------------------------------------------------
+//                           INTEL PHI OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef PHI_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 16
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 16
+#define BLOCK_BIO_PAIR 16
+#define BLOCK_ELLIPSE 16
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 4
+#define BLOCK_CELL_ID 16
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
 // -------------------------------------------------------------------------
 //                            GENERIC OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@@ -433,7 +481,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
 //                  ARCHITECTURE INDEPENDENT DEFINITIONS
 // -------------------------------------------------------------------------
 
+#ifndef PPPM_MAX_SPLINE
 #define PPPM_MAX_SPLINE 8
+#endif
 
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu
index 1308a70a7f..3a65ce14ce 100644
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@@ -74,7 +74,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@@ -125,7 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp
index b010f0f21c..e1d8fffb8f 100644
--- a/lib/gpu/lal_re_squared_ext.cpp
+++ b/lib/gpu/lal_re_squared_ext.cpp
@@ -50,7 +50,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu
index 71dc0e536b..4742e5bd8e 100644
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@@ -172,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
@@ -199,7 +199,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -424,7 +424,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int j, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
@@ -435,7 +435,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int i=*nbor;
+      int i=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(i)];
       i &= NEIGHMASK;
 
@@ -616,19 +616,19 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -710,20 +710,20 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu
index a0c677973e..b7c32b6879 100644
--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@@ -53,19 +53,19 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -141,20 +141,20 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp
index 5e7f38d054..9591923965 100644
--- a/lib/gpu/lal_soft_ext.cpp
+++ b/lib/gpu/lal_soft_ext.cpp
@@ -48,7 +48,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index c99aa3f675..035c05672a 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -156,18 +156,18 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
     
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -195,17 +195,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
         numtyp sw_cut=sw3_ijparam.x;
         numtyp sw_cutsq=sw3_ijparam.y;
         numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
-            ucl_powr(sw_sigma,sw_powerp);
+            pow(sw_sigma,sw_powerp);
         numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
-            ucl_powr(sw_sigma,sw_powerq);
+            pow(sw_sigma,sw_powerq);
         numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
-            ucl_powr(sw_sigma,sw_powerp+(numtyp)1.0);
+            pow(sw_sigma,sw_powerp+(numtyp)1.0);
         numtyp pre_sw_c4=sw_biga*sw_epsilon*
-            ucl_powr(sw_sigma,sw_powerq+(numtyp)1.0);
+            pow(sw_sigma,sw_powerq+(numtyp)1.0);
         numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
-            ucl_powr(sw_sigma,sw_powerp);
+            pow(sw_sigma,sw_powerp);
         numtyp pre_sw_c6=sw_biga*sw_epsilon*
-            ucl_powr(sw_sigma,sw_powerq);
+            pow(sw_sigma,sw_powerq);
 
         numtyp r=ucl_sqrt(rsq);
         numtyp rp=ucl_powr(r,-sw_powerp);
@@ -343,6 +343,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                 const int t_per_atom, const int evatom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
@@ -359,21 +360,20 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor_j, *list_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end;
 
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w; 
     itype=map[itype];
 
-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
   
-      int j=*nbor_j;
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -392,15 +392,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       if (rsq1 > sw3_ijparam.y) continue;
 
       numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
       sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
       sw_cut_ij=sw3_ijparam.x;
 
-      const __global int *nbor_k=nbor_j-offset_j+offset_k;
+      int nbor_k=nbor_j-offset_j+offset_k;
       if (nbor_k<=nbor_j)
         nbor_k+=n_stride;
 
-      for ( ; nbor_k<list_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
@@ -415,11 +417,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
         if (rsq2 < sw3_ikparam.y) {   // sw_cutsq=sw3[ikparam].y;
           numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
           sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
           sw_cut_ik=sw3_ikparam.x;
 
           int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
           numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
           sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
           numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@@ -466,6 +472,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                              const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
@@ -482,20 +489,19 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;
 
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -513,27 +519,27 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
 
       if (rsq1 > sw3_ijparam.y) continue;
 
-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype]; 
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
 
-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k]; 
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -541,25 +547,29 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype]; 
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
 
         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
 
-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;
 
-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;
 
           numtyp fjx, fjy, fjz;
           //if (evatom==0) {
@@ -602,6 +612,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                              const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
 
@@ -618,20 +629,19 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;
 
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -649,27 +659,27 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
 
       if (rsq1 > sw3_ijparam.y) continue;
 
-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype]; 
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
-       
-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
+        
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k]; 
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -677,25 +687,29 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype]; 
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype]; 
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);
 
         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
 
-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;
 
-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;
 
           numtyp fjx, fjy, fjz, fkx, fky, fkz;
           threebody(delr1x,delr1y,delr1z,eflag,energy);
diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu
index afc6267902..1033b7fbb8 100644
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@@ -73,19 +73,19 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -171,20 +171,20 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -269,19 +269,19 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -371,20 +371,20 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -473,19 +473,19 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -582,20 +582,20 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -693,19 +693,19 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -800,20 +800,20 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
     
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp
index 8e030ab4ba..172acb7d39 100644
--- a/lib/gpu/lal_table_ext.cpp
+++ b/lib/gpu/lal_table_ext.cpp
@@ -48,7 +48,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index 0caaa0bd10..b0c3b9978d 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -51,19 +51,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -140,20 +140,20 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu
index 65da63f8f7..f9f4767123 100644
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@@ -58,20 +58,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -150,11 +150,11 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
   
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);
@@ -162,9 +162,9 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
   
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp
index f259152e2b..0e3c653e06 100644
--- a/lib/gpu/lal_yukawa_colloid_ext.cpp
+++ b/lib/gpu/lal_yukawa_colloid_ext.cpp
@@ -48,7 +48,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp
index 79f12d11ba..1cc89885aa 100644
--- a/lib/gpu/lal_yukawa_ext.cpp
+++ b/lib/gpu/lal_yukawa_ext.cpp
@@ -48,7 +48,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }