forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12655 f3b2605a-c512-4ea7-a41b-209d697bcdaa
parent c21dee6f53
commit 7510ec7960
@@ -1 +1 @@
-Geryon Version 13.209
+Geryon Version 13.234
@@ -159,6 +159,12 @@ class UCL_Device {
   /// Returns true if double precision is support for the device
   inline bool double_precision(const int i) {return arch(i)>=1.3;}
 
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i)
+    { return _properties[i].multiProcessorCount; }
+
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
@@ -185,13 +185,12 @@ class UCL_Device {
   inline bool double_precision(const int i)
     {return _properties[i].double_precision;}
 
-  /// Get the number of cores in the current device
-  inline unsigned cores() { return cores(_device); }
-  /// Get the number of cores
-  inline unsigned cores(const int i)
-    { if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
-      else return _properties[i].compute_units*8; }
+  /// Get the number of compute units on the current device
+  inline unsigned cus() { return cus(_device); }
+  /// Get the number of compute units
+  inline unsigned cus(const int i)
+    { return _properties[i].compute_units; }
 
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
   /// Get the gigabytes of global memory
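Note: with this change both device wrappers report compute units directly (multiProcessorCount under CUDA, compute_units under OpenCL) instead of the earlier per-core estimate. A minimal host-side sketch of the accessor, assuming a UCL_Device named dev; num_devices() and set() are taken from the surrounding Geryon API and the setup shown is illustrative, not part of the patch:

    // Sketch only: query the current accelerator through Geryon.
    UCL_Device dev;                  // enumerates available devices
    if (dev.num_devices() > 0) {
      dev.set(0);                    // make device 0 current
      unsigned units = dev.cus();    // compute units on the current device
      double gb = dev.gigabytes();   // global memory, in GB
    }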
@@ -127,7 +127,8 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
   orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
 
   mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
-                              mat.host_ptr(), &error_flag);
+                              *mat.host_ptr(), &error_flag);
+
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
@@ -174,7 +175,7 @@ template <class mat_type>
 inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
   cl_int error_flag;
   mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                              n,mat.host_ptr(),&error_flag);
+                              n,*mat.host_ptr(),&error_flag);
   CL_CHECK_ERR(error_flag);
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
@@ -221,7 +221,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
     return energy_virial(eatom,vatom,virial);
 
   double evdwl=0.0;
-  int vstart=0, iend=_inum;
+  int ii, vstart=0, iend=_inum;
   if (_eflag) {
     iend=_inum*2;
     for (int i=0; i<_inum; i++)
@@ -235,10 +235,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
       for (int i=_inum; i<iend; i++)
         eatom[i]+=engv[i];
     } else {
-      for (int i=0; i<_inum; i++)
-        eatom[_ilist[i]]+=engv[i];
-      for (int i=_inum; i<iend; i++)
-        eatom[_ilist[i]]+=engv[i];
+      for (int i=0, ii=0; i<_inum; i++)
+        eatom[_ilist[ii++]]+=engv[i];
+      for (int i=_inum, ii=0; i<iend; i++)
+        eatom[_ilist[ii++]]+=engv[i];
     }
     vstart=iend;
     iend+=_inum;
@@ -249,12 +249,10 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
         virial[j]+=engv[i];
       if (_vf_atom)
         if (_ilist==NULL) {
-          int ii=0;
           for (int i=vstart; i<iend; i++)
-            vatom[ii++][j]+=engv[i];
+            vatom[i++][j]+=engv[i];
         } else {
-          int ii=0;
-          for (int i=vstart; i<iend; i++)
+          for (int i=vstart, ii=0; i<iend; i++)
             vatom[_ilist[ii++]][j]+=engv[i];
         }
       vstart+=_inum;
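Note: the rewritten loops above keep a separate running index into _ilist, so the second block of per-atom energies restarts its atom mapping at zero instead of reusing the energy index. The access pattern, condensed from the hunks:

    // engv holds _inum vdW energies followed by _inum Coulomb energies;
    // ii restarts for the second block so both map through _ilist.
    for (int i=0, ii=0; i<_inum; i++)
      eatom[_ilist[ii++]] += engv[i];
    for (int i=_inum, ii=0; i<iend; i++)
      eatom[_ilist[ii++]] += engv[i];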
@@ -23,22 +23,21 @@
 ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
 
 #define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
-                  i, numj, stride, list_end, nbor) \
-  nbor=nbor_mem+ii; \
-  i=*nbor; \
-  nbor+=nbor_stride; \
-  numj=*nbor; \
+                  i, numj, stride, nbor_end, nbor_begin) \
+  i=nbor_mem[ii]; \
+  nbor_begin=ii+nbor_stride; \
+  numj=nbor_mem[nbor_begin]; \
   if (nbor_mem==packed_mem) { \
-    nbor+=nbor_stride+fast_mul(ii,t_per_atom-1); \
+    nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
     stride=fast_mul(t_per_atom,nbor_stride); \
-    list_end=nbor+fast_mul(numj/t_per_atom,stride)+ (numj & (t_per_atom-1)); \
-    nbor+=offset; \
+    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
+    nbor_begin+=offset; \
  } else { \
-    nbor+=nbor_stride; \
-    nbor=packed_mem+*nbor; \
-    list_end=nbor+numj; \
+    nbor_begin+=nbor_stride; \
+    nbor_begin=nbor_mem[nbor_begin]; \
+    nbor_end=nbor_begin+numj; \
     stride=t_per_atom; \
-    nbor+=offset; \
+    nbor_begin+=offset; \
  }
 
 #if (ARCH < 300)
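Note: this is the central change of the commit — nbor_info now yields integer offsets into the neighbor arrays instead of __global pointers, so every kernel below indexes dev_packed rather than dereferencing a pointer. The new call pattern, as it appears in the kernels:

    int nbor, nbor_end;            // offsets into the neighbor list
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int j=dev_packed[nbor];      // load by index instead of *nbor
      /* unpack special-bond bits from j and compute the interaction */
    }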
@@ -75,15 +74,15 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -123,17 +122,17 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
-      *engv=e_coul*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
+      engv[ei]=e_coul*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -158,15 +157,15 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
@@ -190,17 +189,17 @@
     } \
   } \
   if (offset==0) { \
-    engv+=ii; \
+    int ei=ii; \
     if (eflag>0) { \
-      *engv=energy*(acctyp)0.5; \
-      engv+=inum; \
-      *engv=e_coul*(acctyp)0.5; \
-      engv+=inum; \
+      engv[ei]=energy*(acctyp)0.5; \
+      ei+=inum; \
+      engv[ei]=e_coul*(acctyp)0.5; \
+      ei+=inum; \
     } \
     if (vflag>0) { \
       for (int i=0; i<6; i++) { \
-        *engv=virial[i]*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=virial[i]*(acctyp)0.5; \
+        ei+=inum; \
       } \
     } \
     ans[ii]=f; \
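Note: all four store_answers variants above make the same substitution — a running offset ei into engv replaces pointer arithmetic on engv itself. Stripped of the macro continuations, the store layout is:

    // Consecutive per-atom quantities are spaced inum apart in engv.
    int ei=ii;
    if (eflag>0) {
      engv[ei]=energy*(acctyp)0.5; ei+=inum;   // then e_coul, if present
    }
    if (vflag>0)
      for (int i=0; i<6; i++) {
        engv[ei]=virial[i]*(acctyp)0.5; ei+=inum;
      }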
@@ -202,6 +202,7 @@ void BaseEllipsoidT::output_times() {
   MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
              device->replica());
   double max_mb=mpi_max_bytes/(1024*1024);
+  double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];
 
   if (device->replica_me()==0)
     if (screen && times[5]>0.0) {
@@ -209,11 +210,11 @@ void BaseEllipsoidT::output_times() {
 
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
-      fprintf(screen," GPU Time Info (average): ");
+      fprintf(screen," Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (device->procs_per_gpu()==1) {
+      if (device->procs_per_gpu()==1 && t_time>0) {
         fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
         fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
         fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
@@ -226,7 +227,8 @@ void BaseEllipsoidT::output_times() {
       }
       if (nbor->gpu_nbor()==2)
         fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
-      fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
+      if (times[6]>0)
+        fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
       fprintf(screen,"Average split: %.4f.\n",avg_split);
       fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
       fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
@@ -52,19 +52,19 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -154,20 +154,20 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -48,7 +48,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -53,19 +53,19 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -148,20 +148,20 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -66,18 +66,18 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -188,19 +188,19 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -52,7 +52,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -71,9 +71,9 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -69,11 +69,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -85,8 +85,8 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -105,7 +105,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq_sigma[mtype].x) { // cutsq
         numtyp r2inv = ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
 
@@ -124,10 +124,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -137,8 +138,11 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
@@ -201,11 +205,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -218,8 +222,8 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -237,7 +241,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
 
       if (rsq<cutsq_sigma[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
 
@@ -256,10 +260,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
           fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
 
-          erfcc = erfc(arij);
+          const numtyp erfcc = erfc(arij);
           v_sh = (erfcc - e_shift*r)*prefactor;
           numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
-          forcecoul = prefactor * dvdrr*rsq*factor_coul;
+          forcecoul = prefactor * dvdrr*rsq;
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -269,8 +274,11 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          if (rsq < cut_coulsq)
-            e_coul += v_sh*factor_coul;
+          if (rsq < cut_coulsq) {
+            numtyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
           if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
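Note: besides the index-based neighbor loop, both Wolf kernels change how factor_coul is applied — instead of scaling the whole damped term, the excluded fraction of the bare prefactor is subtracted, for force and energy alike. Condensed from the hunks above:

    forcecoul = prefactor * dvdrr*rsq;
    if (factor_coul < (numtyp)1.0)
      forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
    // and, when eflag>0 and rsq < cut_coulsq:
    numtyp e=v_sh;
    if (factor_coul < (numtyp)1.0)
      e -= ((numtyp)1.0-factor_coul)*prefactor;
    e_coul += e;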
@@ -52,7 +52,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -50,7 +50,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -68,9 +68,9 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -52,19 +52,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -145,20 +145,20 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -65,18 +65,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -182,19 +182,19 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -51,7 +51,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -66,18 +66,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -190,19 +190,19 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -52,7 +52,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -49,7 +49,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
      fflush(screen);
    }
@@ -52,19 +52,19 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -150,20 +150,20 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -49,7 +49,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -65,18 +65,18 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -191,19 +191,19 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -51,7 +51,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -69,9 +69,9 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -66,13 +66,14 @@ int CHARMMLongT::init(const int ntypes,
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
-  if (this->_block_bio_size>=64 && mix_arithmetic)
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (this->_block_bio_size>=64 && mix_arithmetic &&
+      lj_types<=max_bio_shared_types)
     shared_types=true;
   _lj_types=lj_types;
 
   // Allocate a host write buffer for data initialization
   int h_size=lj_types*lj_types;
-  int max_bio_shared_types=this->device->max_bio_shared_types();
   if (h_size<max_bio_shared_types)
     h_size=max_bio_shared_types;
   UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
@@ -84,8 +85,10 @@ int CHARMMLongT::init(const int ntypes,
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_lj3,host_lj4);
 
-  ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  if (shared_types) {
+    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
+    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  }
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
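Note: the init changes above make the shared-memory ("fast") path conditional on the type count actually fitting, and the per-type epsilon/sigma table is only built when that path is taken. Condensed from the two hunks:

    int max_bio_shared_types=this->device->max_bio_shared_types();
    if (this->_block_bio_size>=64 && mix_arithmetic &&
        lj_types<=max_bio_shared_types)
      shared_types=true;
    // ... later, allocate ljd only when the fast kernel will run:
    if (shared_types) {
      ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
      this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
    }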
@@ -32,7 +32,7 @@ texture<int2> q_tex;
 __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const int lj_types,
-                            const __global numtyp *restrict sp_lj_in,
+                            const __global numtyp *restrict sp_lj,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
@@ -47,16 +47,6 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp sp_lj[8];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-  sp_lj[4]=sp_lj_in[4];
-  sp_lj[5]=sp_lj_in[5];
-  sp_lj[6]=sp_lj_in[6];
-  sp_lj[7]=sp_lj_in[7];
-
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -66,18 +56,18 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -196,18 +186,18 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
@@ -53,7 +53,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -72,9 +72,9 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -55,19 +55,19 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -219,20 +219,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -51,7 +51,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -70,9 +70,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -60,20 +60,19 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-      numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      numtyp factor_coul;
+      int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
@@ -152,21 +151,21 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      numtyp factor_coul;
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -30,19 +30,19 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
-                          const __global numtyp *restrict scale,
-                          const int lj_types,
-                          const __global numtyp *restrict sp_cl_in,
-                          const __global int *dev_nbor,
-                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
-                          __global acctyp *restrict engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nbor_pitch,
-                          const __global numtyp *restrict q_ ,
-                          const __global numtyp *restrict cutsq,
-                          const numtyp qqrd2e, const numtyp kappa,
-                          const int t_per_atom) {
+                           const __global numtyp *restrict scale,
+                           const int lj_types,
+                           const __global numtyp *restrict sp_cl_in,
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed,
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv,
+                           const int eflag, const int vflag, const int inum,
+                           const int nbor_pitch,
+                           const __global numtyp *restrict q_ ,
+                           const __global numtyp *restrict cutsq,
+                           const numtyp qqrd2e, const numtyp kappa,
+                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,20 +61,20 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -158,21 +158,21 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
-
-      numtyp factor_coul = sp_cl[sbmask(j)];
+    numtyp factor_coul;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_coul = sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -62,11 +62,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -77,8 +77,8 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
@@ -163,11 +163,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
@@ -178,8 +178,8 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
       e_coul += (acctyp)2.0*e_self;
     }
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
       factor_coul = sp_lj[sbmask(j)];
@@ -49,7 +49,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -66,9 +66,9 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -124,7 +124,8 @@ texture<int2> q_tex;
 #endif
 
 __kernel void k_coul_long(const __global numtyp4 *restrict x_,
-                          const __global numtyp *restrict scale,
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3,
                           const int lj_types,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
@@ -153,44 +154,41 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int itype=ix.w;
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype=jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      int mtype=itype*lj_types+jtype;
-
       if (rsq < cut_coulsq) {
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
         numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -198,7 +196,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-           e_coul += prefactor*(_erfc-factor_coul);
+          e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -217,7 +215,8 @@
 }
 
 __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
-                               const __global numtyp *restrict scale_in,
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
@@ -231,14 +230,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
-  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_cl[4];
   if (tid<4)
     sp_cl[tid]=sp_cl_in[tid];
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    scale[tid]=scale_in[tid];
-  }
 
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -249,27 +244,24 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
-    int iw = ix.w;
-    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
 
       numtyp factor_coul;
       factor_coul = (numtyp)1.0-sp_cl[sbmask(j)];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int mtype=itype+jx.w;
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
@@ -280,13 +272,13 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         numtyp r2inv=ucl_recip(rsq);
         numtyp force, prefactor, _erfc;
 
-        numtyp r = ucl_sqrt(rsq);
+        numtyp r = ucl_rsqrt(r2inv);
         numtyp grij = g_ewald * r;
         numtyp expm2 = ucl_exp(-grij*grij);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
         _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
         fetch(prefactor,j,q_tex);
-        prefactor *= qqrd2e * scale[mtype] * qtmp/r;
+        prefactor *= qqrd2e * qtmp/r;
         force = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul) * r2inv;
 
         f.x+=delx*force;
@@ -294,7 +286,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-           e_coul += prefactor*(_erfc-factor_coul);
+          e_coul += prefactor*(_erfc-factor_coul);
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -48,7 +48,7 @@ int cl_gpu_init(const int ntypes, double **host_scale,
     message=true;
 
   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }
 
@@ -65,9 +65,9 @@ int cl_gpu_init(const int ntypes, double **host_scale,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -48,7 +48,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
                          const int last_gpu, const int gpu_mode,
                          const double p_split, const int nthreads,
                          const int t_per_atom, const double cell_size,
-                         char *ocl_vendor) {
+                         char *ocl_vendor, const int block_pair) {
   _nthreads=nthreads;
 #ifdef _OPENMP
   omp_set_num_threads(nthreads);
@@ -66,6 +66,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   _gpu_mode=gpu_mode;
   _particle_split=p_split;
   _cell_size=cell_size;
+  _block_pair=block_pair;
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
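Note: init_device() gains a trailing block_pair argument that is stored in _block_pair before the kernels are compiled. A hypothetical call through the C wrapper — argument values are illustrative; -1 keeps the block size compiled into the device library (see compile_kernels() below):

    lmp_init_device(world, replica, first_gpu, last_gpu, gpu_mode,
                    particle_split, nthreads, t_per_atom, cell_size,
                    opencl_vendor, /*block_pair=*/-1);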
@@ -175,6 +176,12 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
   } else if (s_vendor=="cypress") {
     _ocl_vendor_name="AMD Cypress";
     _ocl_vendor_string="-DCYPRESS_OCL";
+  } else if (s_vendor=="phi") {
+    _ocl_vendor_name="Intel Phi";
+    _ocl_vendor_string="-DPHI_OCL";
+  } else if (s_vendor=="intel") {
+    _ocl_vendor_name="Intel CPU";
+    _ocl_vendor_string="-DINTEL_OCL";
   } else if (s_vendor=="generic") {
     _ocl_vendor_name="GENERIC";
     _ocl_vendor_string="-DGENERIC_OCL";
@@ -352,10 +359,10 @@ void DeviceT::init_message(FILE *screen, const char *name,
   for (int i=first_gpu; i<last; i++) {
     std::string sname;
     if (i==first_gpu)
-      sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+      sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
            toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ (";
     else
-      sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
+      sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+
            toa(gpu->clock_rate(i))+" GHZ (";
     if (sizeof(PRECISION)==4) {
       if (sizeof(ACC_PRECISION)==4)
@ -520,7 +527,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
|
|||
if (screen && times[5]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
|
@ -582,7 +589,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
|
|||
if (screen && times[6]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
|
@ -672,7 +679,7 @@ int DeviceT::compile_kernels() {
|
|||
_threads_per_charge=gpu_lib_data[13];
|
||||
_pppm_max_spline=gpu_lib_data[4];
|
||||
_pppm_block=gpu_lib_data[5];
|
||||
_block_pair=gpu_lib_data[6];
|
||||
if (_block_pair == -1) _block_pair=gpu_lib_data[6];
|
||||
_max_shared_types=gpu_lib_data[7];
|
||||
_block_cell_2d=gpu_lib_data[8];
|
||||
_block_cell_id=gpu_lib_data[9];
|
||||
|
@ -714,10 +721,10 @@ int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
|||
const int last_gpu, const int gpu_mode,
|
||||
const double particle_split, const int nthreads,
|
||||
const int t_per_atom, const double cell_size,
|
||||
char *opencl_vendor) {
|
||||
char *opencl_vendor, const int block_pair) {
|
||||
return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
|
||||
particle_split,nthreads,t_per_atom,
|
||||
cell_size,opencl_vendor);
|
||||
cell_size,opencl_vendor,block_pair);
|
||||
}
|
||||
|
||||
void lmp_clear_device() {
|
||||
|
|
|
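The block_pair plumbing above follows a simple sentinel convention: the value handed to lmp_init_device is stored unconditionally in init_device, and compile_kernels only overwrites it with the GPU library default when the caller passed -1. A minimal sketch of that resolution rule (names taken from the diff; the helper function itself is hypothetical):

// Hypothetical helper illustrating the -1 sentinel fallback above: an
// explicit block size from the caller wins, otherwise the value baked
// into the GPU library (gpu_lib_data[6] in the diff) is used.
static int resolve_block_pair(const int requested, const int lib_default) {
  if (requested == -1)
    return lib_default;   // caller left the option unset
  return requested;       // honor the explicit override
}
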
@@ -51,7 +51,7 @@ class Device {
                   const int last_gpu, const int gpu_mode,
                   const double particle_split, const int nthreads,
                   const int t_per_atom, const double cell_size,
-                   char *vendor_string);
+                   char *vendor_string, const int block_pair);

  /// Initialize the device for Atom and Neighbor storage
  /** \param rot True if quaternions need to be stored

@@ -96,12 +96,12 @@ class Device {
                    const int first_gpu, const int last_gpu);

  /// Perform charge assignment asynchronously for PPPM
-  void set_single_precompute(PPPM<numtyp,acctyp,
-                             float,_lgpu_float4> *pppm);
+  void set_single_precompute(PPPM<numtyp,acctyp,
+                             float,_lgpu_float4> *pppm);

  /// Perform charge assignment asynchronously for PPPM
-  void set_double_precompute(PPPM<numtyp,acctyp,
-                             double,_lgpu_double4> *pppm);
+  void set_double_precompute(PPPM<numtyp,acctyp,
+                             double,_lgpu_double4> *pppm);

  /// Esimate the overhead from GPU calls from multiple procs
  /** \param kernel_calls Number of kernel calls/timestep for timing estimated

@@ -73,17 +73,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -113,17 +113,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -173,19 +173,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -385,11 +385,11 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -397,8 +397,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

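The store macros above change addressing, not layout: engv is still a flat array of inum-long blocks, one block per accumulated quantity, but the new code keeps the base pointer fixed and advances an integer offset ei instead of bumping engv itself. A small host-side sketch of that layout, assuming the dipole ordering the macro writes (energy, e_coul, then six virial terms):

/* Sketch: read back one atom's accumulator from the strided engv
   layout; q = 0 energy, 1 e_coul, 2..7 virial components. */
static double engv_slot(const double *engv, const int inum,
                        const int ii, const int q) {
  return engv[q * inum + ii];
}
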
@@ -50,7 +50,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -74,17 +74,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -114,17 +114,17 @@ texture<int4,1> mu_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv=energy*(acctyp)0.5; \
-        engv+=inum; \
-        *engv=e_coul*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]=energy*(acctyp)0.5; \
+        ei+=inum; \
+        engv[ei]=e_coul*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -174,19 +174,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -418,11 +418,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -430,8 +430,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -50,7 +50,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -187,11 +187,10 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

@@ -199,9 +198,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
    int itag=iv.w;

    numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -308,11 +307,10 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;

@@ -321,9 +319,9 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
    int itag=iv.w;

    numtyp factor_dpd;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -115,14 +115,15 @@ texture<int4> z2r_sp2_tex;
      } \
    } \
    if (offset==0) { \
+      int ei=ii; \
      if (eflag>0) { \
-        engv[ii]+=energy*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]+=energy*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          engv[ii]=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -171,15 +172,15 @@ texture<int4> z2r_sp2_tex;
      } \
    } \
    if (offset==0) { \
-      engv+=ii; \
+      int ei=ii; \
      if (eflag>0) { \
-        *engv+=energy*(acctyp)0.5; \
-        engv+=inum; \
+        engv[ei]+=energy*(acctyp)0.5; \
+        ei+=inum; \
      } \
      if (vflag>0) { \
        for (int i=0; i<6; i++) { \
-          *engv=virial[i]*(acctyp)0.5; \
-          engv+=inum; \
+          engv[ei]=virial[i]*(acctyp)0.5; \
+          ei+=inum; \
        } \
      } \
      ans[ii]=f; \

@@ -209,17 +210,17 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
  acctyp energy = (acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -286,17 +287,17 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -354,18 +355,18 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -468,19 +469,19 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp ifp; fetch(ifp,i,fp_tex); //fp_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

@@ -58,7 +58,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -77,9 +77,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -31,14 +31,13 @@ texture<int4,1> pos_tex, quat_tex;
#endif

#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
-                    i, numj, stride, list_end, nbor) \
-  nbor=nbor_mem+ii; \
-  i=*nbor; \
-  nbor+=nbor_stride; \
-  numj=*nbor; \
-  nbor+=nbor_stride; \
-  list_end=nbor+fast_mul(nbor_stride,numj); \
-  nbor+=fast_mul(offset,nbor_stride); \
+                    i, numj, stride, nbor_end, nbor_begin) \
+  i=nbor_mem[ii]; \
+  nbor_begin=ii+nbor_stride; \
+  numj=nbor_mem[nbor_begin]; \
+  nbor_begin+=nbor_stride; \
+  nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \
+  nbor_begin+=fast_mul(offset,nbor_stride); \
  stride=fast_mul(t_per_atom,nbor_stride);

#if (ARCH < 300)

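The rewritten nbor_info_e hands kernels plain integer offsets into nbor_mem instead of __global pointers, so loop bounds compare ints and every access is an explicit index. The two iteration styles, side by side in plain C (array and variable names are stand-ins):

/* Old style: march a pointer to a precomputed end pointer. */
static int sum_ptr(const int *list, const int *list_end, const int stride) {
  int s = 0;
  for (const int *n = list; n < list_end; n += stride)
    s += *n;                 /* neighbor read via dereference */
  return s;
}

/* New style: march an integer offset and index the base array. */
static int sum_idx(const int *nbor_mem, const int begin, const int end,
                   const int stride) {
  int s = 0;
  for (int n = begin; n < end; n += stride)
    s += nbor_mem[n];        /* same reads, explicit indexing */
  return s;
}

Both loops touch the same elements; the index form simply avoids arithmetic on __global pointers inside the macro, which some OpenCL targets handle poorly (a presumed motivation, not stated in the commit).
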
@@ -41,20 +41,19 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
  int ii=GLOBAL_ID_X+start;

  if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
+    nbor+=nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul(iw,ntypes);
    int newj=0;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

@@ -70,7 +69,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
        rsq+=t*t;

      if (rsq<cf.x) {
-        *packed=j;
+        dev_nbor[packed]=j;
        packed+=nbor_pitch;
        newj++;
      }

@@ -105,21 +104,20 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor=dev_ij+ii;
-    int i=*nbor;
-    nbor+=nbor_pitch;
-    int numj=*nbor;
-    nbor+=nbor_pitch;
-    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
-    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
+    int i=dev_ij[ii];
+    int nbor=ii+nbor_pitch;
+    int numj=dev_ij[nbor];
+    nbor+=nbor_pitch;
+    int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int packed=ii+nbor_pitch+nbor_pitch;

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    int newj=0;
-    for ( ; nbor<list_end; nbor+=nbor_pitch) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int j=dev_ij[nbor];
      j &= NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

@@ -135,7 +133,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
        rsq+=t*t;

      if (rsq<cutsq[mtype]) {
-        *packed=j;
+        dev_nbor[packed]=j;
        packed+=nbor_pitch;
        newj++;
      }

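kernel_nbor's storage convention is easier to see outside the diff: for slot ii, the list holds the atom index at ii, the neighbor count one pitch in, and packed neighbors starting two pitches in, spaced one pitch apart. A host-side sketch of reading one atom's packed list under that assumption:

/* Sketch: pitched (column-style) neighbor storage, as written above.
   Copies at most max_out packed neighbors of slot ii into out[]. */
static int read_packed(const int *dev_nbor, const int nbor_pitch,
                       const int ii, int *out, const int max_out) {
  int numj = dev_nbor[ii + nbor_pitch];
  int nbor = ii + 2 * nbor_pitch;        /* first packed neighbor */
  int n = (numj < max_out) ? numj : max_out;
  for (int k = 0; k < n; k++, nbor += nbor_pitch)
    out[k] = dev_nbor[nbor];
  return n;
}
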
@@ -51,19 +51,19 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -138,20 +138,20 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -119,7 +119,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -140,7 +140,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -52,7 +52,7 @@ int gb_gpu_init(const int ntypes, const double gamma,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -71,9 +71,9 @@ int gb_gpu_init(const int ntypes, const double gamma,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -68,7 +68,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -276,19 +276,19 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex);
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -371,20 +371,20 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -28,7 +28,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                   const __global numtyp4 *restrict lj1,
                   const __global numtyp4 *restrict lj3,
                   const int lj_types,
-                   const __global numtyp *restrict sp_lj_in,
+                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
                   __global acctyp4 *restrict ans,

@@ -38,12 +38,6 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

-  __local numtyp sp_lj[4];
-  sp_lj[0]=sp_lj_in[0];
-  sp_lj[1]=sp_lj_in[1];
-  sp_lj[2]=sp_lj_in[2];
-  sp_lj[3]=sp_lj_in[3];
-
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;

@@ -52,19 +46,18 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -142,20 +135,19 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

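Throughout these kernels the packed neighbor value j carries more than an index: sbmask(j) selects one of the four special-bond factors in sp_lj, and j &= NEIGHMASK strips those bits to recover the real atom index. A sketch of the decode step, assuming the usual LAMMPS encoding (top two bits, SBBITS = 30):

/* Sketch: decode a packed neighbor entry into (factor, index).
   The bit layout is the standard LAMMPS one and is assumed here. */
#define SBBITS    30
#define NEIGHMASK 0x3FFFFFFF
static int sbmask(const int j) { return j >> SBBITS & 3; }

static int decode_neighbor(const int j, const double *sp_lj,
                           double *factor) {
  *factor = sp_lj[sbmask(j)];   /* 1.0 or the 1-2/1-3/1-4 scale factor */
  return j & NEIGHMASK;         /* plain neighbor index */
}
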
@@ -52,19 +52,19 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -143,20 +143,20 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -65,18 +65,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -187,19 +187,19 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -65,18 +65,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -178,19 +178,19 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -66,18 +66,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -186,19 +186,19 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -50,7 +50,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -68,9 +68,9 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -80,7 +80,7 @@ int LJCoulLongT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                        host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,

@@ -102,6 +102,23 @@ int LJCoulLongT::init(const int ntypes,
  return 0;
}

+template <class numtyp, class acctyp>
+void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                         double **host_lj2, double **host_lj3, double **host_lj4,
+                         double **host_offset, double **host_cut_ljsq) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_cut_ljsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+}
+
template <class numtyp, class acctyp>
void LJCoulLongT::clear() {
  if (!_allocated)

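reinit repacks the coefficient tables into the existing device allocations (note that it reuses lj1 and lj3 without re-allocating them), which is what lets a host-side fix adapt rescale pair coefficients mid-run without tearing the style down. A hypothetical call sequence (the driver function is illustrative, not part of this diff):

// Hypothetical driver: after the host rescales its coefficient tables,
// push them to the device in one repack; no device realloc occurs.
template <class numtyp, class acctyp>
void push_adapted_coeffs(LJCoulLong<numtyp,acctyp> &pair, const int ntypes,
                         double **cutsq, double **lj1, double **lj2,
                         double **lj3, double **lj4, double **offset,
                         double **cut_ljsq) {
  pair.reinit(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, cut_ljsq);
}
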
@@ -65,18 +65,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -183,19 +183,19 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -46,6 +46,11 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double g_ewald);

+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_cut_ljsq);
+
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

@@ -51,7 +51,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -69,9 +69,9 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -93,6 +93,29 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  return init_ok;
}

+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                     double **host_lj2, double **host_lj3, double **host_lj4,
+                     double **offset, double **host_cut_ljsq) {
+  int world_me=LJCLMF.device->world_me();
+  int gpu_rank=LJCLMF.device->gpu_rank();
+  int procs_per_gpu=LJCLMF.device->procs_per_gpu();
+
+  if (world_me==0)
+    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                  offset, host_cut_ljsq);
+  LJCLMF.device->world_barrier();
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                    offset, host_cut_ljsq);
+    LJCLMF.device->gpu_barrier();
+  }
+}
+
void ljcl_gpu_clear() {
  LJCLMF.clear();
}

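ljcl_gpu_reinit reuses the staggered-update protocol of the *_gpu_init wrappers: world rank 0 updates first while everyone waits at the world barrier, then the remaining ranks sharing a device take one gpu_barrier round each. A compressed sketch of the protocol in plain C (function pointers stand in for the Device methods):

/* Sketch: rank-staggered device update, as used by ljcl_gpu_reinit. */
static void staggered_update(const int world_me, const int gpu_rank,
                             const int procs_per_gpu,
                             void (*do_update)(void),
                             void (*world_barrier)(void),
                             void (*gpu_barrier)(void)) {
  if (world_me == 0)
    do_update();              /* rank 0 goes first */
  world_barrier();
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      do_update();            /* one rank per device per round */
    gpu_barrier();
  }
}
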
@@ -118,11 +118,11 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -130,8 +130,8 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,

    numtyp cut_coul = ucl_sqrt(cut_coulsq);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -239,11 +239,11 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -252,8 +252,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,

    numtyp cut_coul = ucl_sqrt(cut_coulsq);

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];

@@ -51,7 +51,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -70,9 +70,9 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -68,11 +68,11 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -84,8 +84,8 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
      e_coul += (acctyp)2.0*e_self;
    }

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];

@@ -195,11 +195,11 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

@@ -212,8 +212,8 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
      e_coul += (acctyp)2.0*e_self;
    }

-    for ( ; nbor<list_end; nbor+=n_stride) {
-      int j=*nbor;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];

@@ -52,7 +52,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -71,9 +71,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -54,19 +54,19 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -147,20 +147,20 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
    int i, numj;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -49,7 +49,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -66,9 +66,9 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -48,7 +48,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
      message=true;

  if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
    fflush(screen);
  }

@@ -65,9 +65,9 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
      else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }

@@ -55,19 +55,18 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
      virial[i]=(acctyp)0;

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -161,20 +160,19 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    const __global int *nbor, *list_end;
-    int i, numj;
+    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

@@ -52,19 +52,19 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -143,20 +143,20 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
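The `_fast` variants differ from the plain kernels in the line `int itype=fast_mul((int)MAX_SHARED_TYPES,iw);`: per-pair coefficients are staged in shared memory and addressed by a flattened (itype, jtype) index, with the row offset hoisted out of the neighbor loop. A hedged sketch of that flattened lookup (the 2D-to-1D mapping follows the kernels above; the MAX_SHARED_TYPES value and table contents are illustrative):

    #include <cstdio>

    // Flattened 2D coefficient lookup as used by the *_fast kernels.
    constexpr int MAX_SHARED_TYPES = 8;   // per-architecture; 8 is illustrative

    int main() {
      double lj1[MAX_SHARED_TYPES * MAX_SHARED_TYPES] = {0};
      lj1[2 * MAX_SHARED_TYPES + 5] = 1.25;   // coefficient for type pair (2,5)

      int iw = 2;                          // atom i's type
      int itype = MAX_SHARED_TYPES * iw;   // row offset, computed once per atom
      int jw = 5;                          // neighbor j's type
      int mtype = itype + jw;              // full flattened index
      std::printf("%g\n", lj1[mtype]);     // prints 1.25
      return 0;
    }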
@@ -50,7 +50,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -68,9 +68,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -54,19 +54,19 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -145,20 +145,20 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -49,7 +49,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -66,9 +66,9 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -65,6 +65,11 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   else
     _alloc_packed=false;

+  if (pre_cut)
+    _packed_permissions=UCL_READ_WRITE;
+  else
+    _packed_permissions=UCL_READ_ONLY;
+
   bool success=true;

   // Initialize timers for the selected GPU

@@ -121,7 +126,7 @@ void Neighbor::alloc(bool &success) {
   if (_alloc_packed) {
     dev_packed.clear();
     success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
-                                         UCL_READ_ONLY)==UCL_SUCCESS);
+                                         _packed_permissions)==UCL_SUCCESS);
     _c_bytes+=dev_packed.row_bytes();
   }
   if (_max_host>0) {
@@ -227,6 +227,7 @@ class Neighbor {
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
   bool _gpu_host, _alloc_packed;
   double _cutoff, _cell_size, _bin_time;
+  enum UCL_MEMOPT _packed_permissions;

   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
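Taken together, the three neighbor hunks route one decision through a stored member: init() records whether the packed list must be writable (presumably because a pre-cutoff build rewrites the packed entries in place), and alloc() forwards that record instead of hard-coding UCL_READ_ONLY. A reduced C++ sketch of the plumbing; the UCL names follow the diff, but the enum values and class body here are stand-ins, not the Geryon source:

    // Reduced illustration of the permission plumbing; not the full class.
    enum UCL_MEMOPT { UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY };

    class NeighborSketch {
     public:
      void init(bool pre_cut) {
        // When a pre-cutoff pass must trim the packed list on the device,
        // the buffer has to be writable; otherwise read-only suffices.
        if (pre_cut)
          _packed_permissions = UCL_READ_WRITE;
        else
          _packed_permissions = UCL_READ_ONLY;
      }
      void alloc() {
        // Previously: dev_packed.alloc(..., UCL_READ_ONLY);
        // Now the stored choice is forwarded:
        // dev_packed.alloc((_max_nbors+2)*_max_atoms, *dev, _packed_permissions);
      }
     private:
      enum UCL_MEMOPT _packed_permissions;
    };

    int main() {
      NeighborSketch n;
      n.init(true);    // pre-cut build: packed list must be writable
      n.init(false);   // plain build: read-only is enough
      return 0;
    }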
@@ -25,17 +25,17 @@ __kernel void kernel_unpack(__global int *dev_nbor,
   int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;

   if (ii<inum) {
-    __global int *nbor=dev_nbor+ii+inum;
-    int numj=*nbor;
+    int nbor=ii+inum;
+    int numj=dev_nbor[nbor];
     nbor+=inum;
-    const __global int *list=dev_ij+*nbor;
-    const __global int *list_end=list+numj;
+    int list=dev_nbor[nbor];
+    int list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
     int stride=fast_mul(t_per_atom,inum);

     for ( ; list<list_end; list++) {
-      *nbor=*list;
+      dev_nbor[nbor]=dev_ij[list];
       nbor+=stride;
     }
   } // if ii
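kernel_unpack applies the same index conversion to a writer: t_per_atom work-items share one atom, and each value a work-item stores lands a full `fast_mul(t_per_atom,inum)` apart, so the lanes of one atom occupy consecutive slots in every stride. A small sketch of the destination arithmetic, which mirrors the kernel (the per-lane copy counts and tag values here are invented for display):

    #include <cstdio>
    #include <vector>

    // Mirrors kernel_unpack's destination index: the m-th value written for
    // atom ii at lane 'offset' lands at
    //   2*inum + ii*t_per_atom + offset + m*(t_per_atom*inum)
    int main() {
      const int inum = 3, t_per_atom = 2, per_lane = 4;
      const int header = 2 * inum;          // per-atom counts and list offsets
      std::vector<int> dev_nbor(header + per_lane * t_per_atom * inum, -1);

      for (int ii = 0; ii < inum; ii++)
        for (int offset = 0; offset < t_per_atom; offset++) {
          int nbor = header + ii * t_per_atom + offset;
          const int stride = t_per_atom * inum;
          for (int m = 0; m < per_lane; m++, nbor += stride)
            dev_nbor[nbor] = 100 * ii + 10 * offset + m;  // tag for display
        }

      for (int n = header; n < (int)dev_nbor.size(); n++)
        std::printf("%d ", dev_nbor[n]);    // shows the interleaved layout
      std::printf("\n");
      return 0;
    }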
@@ -160,7 +160,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,

   // Allocate error flags for checking out of bounds atoms
   success=success && (error_flag.alloc(1,*ucl_device,UCL_READ_ONLY,
-                                       UCL_WRITE_ONLY)==UCL_SUCCESS);
+                                       UCL_READ_WRITE)==UCL_SUCCESS);
   if (!success) {
     flag=-3;
     return 0;
@@ -51,7 +51,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -69,9 +69,9 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -333,6 +333,54 @@ inline double shfl_xor(double var, int laneMask, int width) {

 #endif

+// -------------------------------------------------------------------------
+// INTEL CPU OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef INTEL_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 1
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 4
+#define BLOCK_BIO_PAIR 2
+#define BLOCK_ELLIPSE 2
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 1
+#define BLOCK_CELL_ID 2
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
+// -------------------------------------------------------------------------
+// INTEL PHI OPENCL DEFINITIONS
+// -------------------------------------------------------------------------
+
+#ifdef PHI_OCL
+
+#define USE_OPENCL
+#define MEM_THREADS 16
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 16
+#define MAX_SHARED_TYPES 0
+#define BLOCK_NBOR_BUILD 16
+#define BLOCK_BIO_PAIR 16
+#define BLOCK_ELLIPSE 16
+
+#define WARP_SIZE 1
+#define PPPM_BLOCK_1D 32
+#define BLOCK_CELL_2D 4
+#define BLOCK_CELL_ID 16
+#define MAX_BIO_SHARED_TYPES 0
+
+#endif
+
 // -------------------------------------------------------------------------
 // GENERIC OPENCL DEFINITIONS
 // -------------------------------------------------------------------------

@@ -433,7 +481,9 @@ inline double shfl_xor(double var, int laneMask, int width) {
 // ARCHITECTURE INDEPENDENT DEFINITIONS
 // -------------------------------------------------------------------------

+#ifndef PPPM_MAX_SPLINE
 #define PPPM_MAX_SPLINE 8
+#endif

 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
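The two new blocks add per-target tuning constants for Intel CPU and Intel Phi OpenCL builds, selected by a compile-time symbol, and the new guard makes PPPM_MAX_SPLINE overridable the same way. A hedged sketch of how such a selection is typically driven (the -D flags and the generic fallback value are illustrative; the actual build targets are not shown in this diff):

    // tune.cpp -- compile with e.g.  g++ -DINTEL_OCL tune.cpp
    //             or                 g++ -DPHI_OCL -DPPPM_MAX_SPLINE=16 tune.cpp
    #include <cstdio>

    #ifdef INTEL_OCL
    #define BLOCK_PAIR 1          // mirrors the INTEL_OCL block above
    #elif defined(PHI_OCL)
    #define BLOCK_PAIR 16         // wider blocks for the Phi target
    #else
    #define BLOCK_PAIR 64         // stand-in generic default (assumption)
    #endif

    #ifndef PPPM_MAX_SPLINE       // overridable default, as in the hunk above
    #define PPPM_MAX_SPLINE 8
    #endif

    int main() {
      std::printf("BLOCK_PAIR=%d PPPM_MAX_SPLINE=%d\n",
                  BLOCK_PAIR, PPPM_MAX_SPLINE);
      return 0;
    }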
@@ -74,7 +74,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -125,7 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -50,7 +50,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -68,9 +68,9 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -172,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,

@@ -199,7 +199,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int j=*nbor;
+      int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -424,7 +424,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *nbor_end;
+    int nbor, nbor_end;
     int j, numj;
     __local int n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,

@@ -435,7 +435,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,

     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-      int i=*nbor;
+      int i=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(i)];
       i &= NEIGHMASK;

@@ -616,19 +616,19 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex);
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -710,20 +710,20 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
-                n_stride,list_end,nbor);
+                n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex);
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -53,19 +53,19 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -141,20 +141,20 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -48,7 +48,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -156,18 +156,18 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
      j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -195,17 +195,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
       numtyp sw_cut=sw3_ijparam.x;
       numtyp sw_cutsq=sw3_ijparam.y;
       numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp);
+                       pow(sw_sigma,sw_powerp);
       numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
-                       ucl_powr(sw_sigma,sw_powerq);
+                       pow(sw_sigma,sw_powerq);
       numtyp pre_sw_c3=sw_biga*sw_epsilon*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp+(numtyp)1.0);
+                       pow(sw_sigma,sw_powerp+(numtyp)1.0);
       numtyp pre_sw_c4=sw_biga*sw_epsilon*
-                       ucl_powr(sw_sigma,sw_powerq+(numtyp)1.0);
+                       pow(sw_sigma,sw_powerq+(numtyp)1.0);
       numtyp pre_sw_c5=sw_biga*sw_epsilon*sw_bigb*
-                       ucl_powr(sw_sigma,sw_powerp);
+                       pow(sw_sigma,sw_powerp);
       numtyp pre_sw_c6=sw_biga*sw_epsilon*
-                       ucl_powr(sw_sigma,sw_powerq);
+                       pow(sw_sigma,sw_powerq);

       numtyp r=ucl_sqrt(rsq);
       numtyp rp=ucl_powr(r,-sw_powerp);
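Every precomputed Stillinger-Weber coefficient switches from ucl_powr to pow here, presumably for portability across the newly supported OpenCL CPU targets: OpenCL's powr(x,y) is only defined for x >= 0 and may carry weaker guarantees than pow. A hedged host-side stand-in demonstrating the domain difference (the real ucl_powr mapping lives in the Geryon/preprocessor headers, not in this diff):

    #include <cmath>
    #include <cstdio>

    // Illustrative only: powr-like semantics via exp(y*log(x)), which is
    // undefined (NaN) for negative bases; pow has the full C semantics.
    static double powr_like(double x, double y) {
      return std::exp(y * std::log(x));
    }

    int main() {
      double sigma = 2.0951, p = 4.0;             // sample SW-style inputs
      std::printf("pow: %.12g  powr-like: %.12g\n",
                  std::pow(sigma, p), powr_like(sigma, p));
      std::printf("pow(-2,3): %g  powr-like(-2,3): %g (undefined domain)\n",
                  std::pow(-2.0, 3.0), powr_like(-2.0, 3.0));
      return 0;
    }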
@@ -343,6 +343,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                 const int t_per_atom, const int evatom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -359,21 +360,20 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=*nbor_j;
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -392,15 +392,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       if (rsq1 > sw3_ijparam.y) continue;

       numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
       sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
       sw_cut_ij=sw3_ijparam.x;

-      const __global int *nbor_k=nbor_j-offset_j+offset_k;
+      int nbor_k=nbor_j-offset_j+offset_k;
       if (nbor_k<=nbor_j)
         nbor_k+=n_stride;

-      for ( ; nbor_k<list_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         numtyp4 kx; fetch4(kx,k,pos_tex);

@@ -415,11 +417,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
         if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
           numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
           sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
           sw_cut_ik=sw3_ikparam.x;

           int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
           numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
           sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
           numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@@ -466,6 +472,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                              const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -482,20 +489,19 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -513,27 +519,27 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

       if (rsq1 > sw3_ijparam.y) continue;

-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;

-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k];
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }

       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         if (k == i) continue;

@@ -541,25 +547,29 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];

         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);

-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;

-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;

           numtyp fjx, fjy, fjz;
           //if (evatom==0) {
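Besides the index refactor, this hunk changes which element triple is looked up: the k-leg cutoff test now uses ikparam (the i-k pair) rather than jkparam, and the three-body term uses ijkparam ordered (i,j,k) instead of the previous (j,i,k). The elem2param table itself is a flattened nelements^3 cube; a small sketch of the indexing (the element count and parameter ids are invented):

    #include <cstdio>

    // elem2param is indexed as a flattened 3D array, exactly as in the
    // kernels: elem2param[a*nelements*nelements + b*nelements + c].
    int main() {
      const int nelements = 2;                    // illustrative
      int elem2param[nelements * nelements * nelements];
      for (int n = 0; n < nelements * nelements * nelements; n++)
        elem2param[n] = n;                        // stand-in parameter ids

      int itype = 0, jtype = 1, ktype = 1;
      // Old lookup for the k leg, triple (j,k,k); new lookup, triple (i,k,k):
      int jkparam = elem2param[jtype*nelements*nelements + ktype*nelements + ktype];
      int ikparam = elem2param[itype*nelements*nelements + ktype*nelements + ktype];
      // Three-body parameters, now ordered (i,j,k):
      int ijkparam = elem2param[itype*nelements*nelements + jtype*nelements + ktype];
      std::printf("jk=%d ik=%d ijk=%d\n", jkparam, ikparam, ijkparam);
      return 0;
    }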
@@ -602,6 +612,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                                    const int t_per_atom) {
   __local int tpa_sq, n_stride;
   tpa_sq=fast_mul(t_per_atom,t_per_atom);
+  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
   numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
   numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@@ -618,20 +629,19 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor_j, *list_end, *k_end;
-    int i, numj;
+    int i, numj, nbor_j, nbor_end, k_end;

     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
-              n_stride,list_end,nbor_j);
+              n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];

-    for ( ; nbor_j<list_end; nbor_j+=n_stride) {
-      int j=*nbor_j;
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_packed[nbor_j];
       j &= NEIGHMASK;

       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -649,27 +659,27 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

       if (rsq1 > sw3_ijparam.y) continue;

-      int jiparam=elem2param[jtype*nelements*nelements+itype*nelements+itype];
-      numtyp4 sw1_jiparam; fetch4(sw1_jiparam,jiparam,sw1_tex);
-      numtyp4 sw3_jiparam; fetch4(sw3_jiparam,jiparam,sw3_tex);
-      sw_sigma_gamma_ij=sw1_jiparam.y*sw1_jiparam.w; //sw_sigma*sw_gamma;
-      sw_cut_ij=sw3_jiparam.x;
-
-      const __global int *nbor_k=dev_nbor+j+nbor_pitch;
-      int numk=*nbor_k;
+      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
+      sw_sigma=sw1_ijparam.y;
+      sw_gamma=sw1_ijparam.w;
+      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
+      sw_cut_ij=sw3_ijparam.x;
+
+      int nbor_k=j+nbor_pitch;
+      int numk=dev_nbor[nbor_k];
       if (dev_nbor==dev_packed) {
         nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
         k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
         nbor_k+=offset_k;
       } else {
         nbor_k+=nbor_pitch;
-        nbor_k=dev_packed+*nbor_k;
+        nbor_k=dev_nbor[nbor_k];
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }

       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=*nbor_k;
+        int k=dev_packed[nbor_k];
         k &= NEIGHMASK;

         if (k == i) continue;

@@ -677,25 +687,29 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
-        int jkparam=elem2param[jtype*nelements*nelements+ktype*nelements+ktype];
-        numtyp4 sw3_jkparam; fetch4(sw3_jkparam,jkparam,sw3_tex);
+        int ikparam=elem2param[itype*nelements*nelements+ktype*nelements+ktype];
+        numtyp4 sw3_ikparam; fetch4(sw3_ikparam,ikparam,sw3_tex);

         numtyp delr2x = kx.x - jx.x;
         numtyp delr2y = kx.y - jx.y;
         numtyp delr2z = kx.z - jx.z;
         numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;

-        if (rsq2 < sw3_jkparam.y) {
-          numtyp4 sw1_jkparam; fetch4(sw1_jkparam,jkparam,sw1_tex);
-          sw_sigma_gamma_ik=sw1_jkparam.y*sw1_jkparam.w; //sw_sigma*sw_gamma;
-          sw_cut_ik=sw3_jkparam.x;
+        if (rsq2 < sw3_ikparam.y) {
+          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
+          sw_sigma=sw1_ikparam.y;
+          sw_gamma=sw1_ikparam.w;
+          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
+          sw_cut_ik=sw3_ikparam.x;

-          int jikparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype];
-          numtyp4 sw1_jikparam; fetch4(sw1_jikparam,jikparam,sw1_tex);
-          sw_lambda_epsilon_ijk=sw1_jikparam.x*sw1_jikparam.z; //sw_lambda*sw_epsilon;
+          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
+          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
+          sw_epsilon=sw1_ijkparam.x;
+          sw_lambda=sw1_ijkparam.z;
+          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
           sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
-          numtyp4 sw3_jikparam; fetch4(sw3_jikparam,jikparam,sw3_tex);
-          sw_costheta_ijk=sw3_jikparam.z;
+          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
+          sw_costheta_ijk=sw3_ijkparam.z;

           numtyp fjx, fjy, fjz, fkx, fky, fkz;
           threebody(delr1x,delr1y,delr1z,eflag,energy);
@@ -73,19 +73,19 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -171,20 +171,20 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -269,19 +269,19 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -371,20 +371,20 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -473,19 +473,19 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -582,20 +582,20 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -693,19 +693,19 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -800,20 +800,20 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
   int tlm1 = tablength - 1;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -48,7 +48,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -51,19 +51,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -140,20 +140,20 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -58,20 +58,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);
     int itype=ix.w;

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;

@@ -150,11 +150,11 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
   __syncthreads();

   if (ii<inum) {
-    const __global int *nbor, *list_end;
+    int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,list_end,nbor);
+              n_stride,nbor_end,nbor);

     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp radi; fetch(radi,i,rad_tex);

@@ -162,9 +162,9 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

     numtyp factor_lj;
-    for ( ; nbor<list_end; nbor+=n_stride) {
+    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=*nbor;
+      int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -48,7 +48,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }
@@ -48,7 +48,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   message=true;

   if (message) {
-    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fprintf(screen,"Initializing Device and compiling on process 0...");
     fflush(screen);
   }

@@ -65,9 +65,9 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa,
   for (int i=0; i<procs_per_gpu; i++) {
     if (message) {
       if (last_gpu-first_gpu==0)
-        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
       else
-        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
                 last_gpu,i);
       fflush(screen);
     }