git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8810 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2012-09-21 15:57:23 +00:00 · 2012-09-21 15:57:23 +00:00 · 209999068c
parent 8bcf139b73
commit 209999068c
53 changed files with 1733 additions and 1081 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -87,7 +87,7 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
       $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
       $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
-       $(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
+       $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \
       $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
       $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
       $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@ -64,16 +64,11 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
       $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
       $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
-       $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
-       $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
-       $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
-       $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
-       $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
-       $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
-       $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
-       $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
-       $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
-       $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
+       $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h \
+       $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
+       $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
+       $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
+       $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h


 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -131,11 +126,11 @@ $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
 $(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h

-$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
+$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;

-$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
+$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;

 $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
@ -143,11 +138,11 @@ $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/g
 $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
 	$(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
+$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;

-$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
+$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;

 $(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@ -17,8 +17,10 @@
 #include "lal_preprocessor.h"
 #endif

-__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
-                            __global int *type, const int nall) {
+__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, 
+                            const __global double *restrict x,
+                            const __global int *restrict type, 
+                            const int nall) {
  int ii=GLOBAL_ID_X;

  if (ii<nall) {
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@ -41,6 +41,8 @@
    nbor+=offset;                                                            \
  }

+#if (ARCH < 300)
+
 #define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
                      eflag, vflag, ans, engv)                              \
  if (t_per_atom>1) {                                                       \
@ -137,3 +139,72 @@
    ans[ii]=f;                                                              \
  }

+#else
+
+#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
+                      eflag, vflag, ans, engv)                              \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      f.x += shfl_xor(f.x, s, t_per_atom);                                  \
+      f.y += shfl_xor(f.y, s, t_per_atom);                                  \
+      f.z += shfl_xor(f.z, s, t_per_atom);                                  \
+      energy += shfl_xor(energy, s, t_per_atom);                            \
+      e_coul += shfl_xor(e_coul, s, t_per_atom);                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@ -132,7 +132,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
  resize_atom(inum,nall,success);
  resize_local(inum,mn,success);
  if (!success)
-    return false;
+    return NULL;

  nbor->get_host(inum,ilist,numj,firstneigh,block_size());

--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@ -135,7 +135,7 @@ int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist,
  resize_atom(inum,nall,success);
  resize_local(inum,mn,success);
  if (!success)
-    return false;
+    return NULL;

  nbor->get_host(inum,ilist,numj,firstneigh,block_size());

--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -137,7 +137,7 @@ int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
  resize_atom(inum,nall,success);
  resize_local(inum,mn,success);
  if (!success)
-    return false;
+    return NULL;

  nbor->get_host(inum,ilist,numj,firstneigh,block_size());

--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@ -80,7 +80,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
  ucl_device=device->gpu;
  atom=&device->atom;

-  _block_size=device->pair_block_size();
+  _block_size=device->block_ellipse();
  compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);

  // Initialize host-device load balancer
@ -118,9 +118,8 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
    ans->force.zero();

  // Memory for ilist ordered by particle type
-  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
-    return 0;
-  else return -3;
+  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
+    return -3;

  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -24,14 +24,18 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, 
-                          __global numtyp2 *cutsq_sigma,
-                          const int lj_types, __global numtyp *sp_lj_in, 
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_born(const __global numtyp4 *restrict x_, 
+                     const __global numtyp4 *restrict coeff1,
+                     const __global numtyp4 *restrict coeff2, 
+                     const __global numtyp2 *restrict cutsq_sigma,
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in, 
+                     const __global int *dev_nbor,
+                     const __global int *dev_packed, 
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -49,7 +53,7 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -108,14 +112,17 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp2 *cutsq_sigma,
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_born_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1_in,
+                          const __global numtyp4 *restrict coeff2_in, 
+                          const __global numtyp2 *restrict cutsq_sigma,
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans, 
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -140,7 +147,7 @@ __kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -29,14 +29,19 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp4 *cutsq_sigma, 
+__kernel void k_born_long(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2,
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma, 
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
@ -61,7 +66,7 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -143,14 +148,18 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *cutsq_sigma,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
@ -178,7 +187,7 @@ __kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -31,14 +31,19 @@ texture<int2> q_tex;

 #define MY_PIS (acctyp)1.77245385090551602729

-__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp4 *cutsq_sigma, 
+__kernel void k_born_wolf(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma, 
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp alf, const numtyp e_shift, 
                          const numtyp f_shift, const int t_per_atom) {
@ -64,7 +69,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -74,7 +79,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
    int itype=ix.w;

    if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
      e_coul += (acctyp)2.0*e_self;
    }

@ -83,7 +89,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -98,7 +104,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
      int mtype=itype*lj_types+jtype;
      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp r2inv = ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
        numtyp v_sh = (numtyp)0.0;
        numtyp rexp = (numtyp)0.0;
        
@ -113,14 +119,14 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
        if (rsq < cut_coulsq) {
          numtyp r=ucl_rsqrt(r2inv);
          numtyp arij = alf * r;
-          numtyp expm2 = ucl_exp(-arij*arij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp erfcd = ucl_exp(-arij*arij);
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
-          v_sh = (_erfc - e_shift*r)*prefactor; 
-          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
-          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
+
+          erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq*factor_coul;
        } else forcecoul = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;
@ -131,8 +137,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,

        if (eflag>0) {
          if (rsq < cut_coulsq)
-            e_coul += prefactor*(v_sh-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+            e_coul += v_sh*factor_coul;
+          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
              + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
@ -154,14 +160,18 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *cutsq_sigma,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp alf, const numtyp e_shift, 
                               const numtyp f_shift, const int t_per_atom) {
@ -190,7 +200,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -201,7 +211,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
      e_coul += (acctyp)2.0*e_self;
    }

@ -210,7 +221,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -224,11 +235,11 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in

      if (rsq<cutsq_sigma[mtype].x) {
        numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
        numtyp v_sh = (numtyp)0.0;
        numtyp rexp = (numtyp)0.0;
        
-        if (rsq < coeff1[mtype].w) {
+        if (rsq < cutsq_sigma[mtype].y) {
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
@ -237,16 +248,16 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
        } else forceborn = (numtyp)0.0;

        if (rsq < cut_coulsq) {
-          numtyp r=ucl_rsqrt(r2inv);
+          numtyp r=ucl_sqrt(rsq);
          numtyp arij = alf * r;
-          numtyp expm2 = ucl_exp(-arij*arij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp erfcd = ucl_exp(-arij*arij);
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
-          v_sh = (_erfc - e_shift*r)*prefactor; 
-          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
-          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
+
+          erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq*factor_coul;
        } else forcecoul = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;
@ -257,8 +268,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in

        if (eflag>0) {
          if (rsq < cut_coulsq)
-            e_coul += prefactor*(v_sh-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+            e_coul += v_sh*factor_coul;
+          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
              + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_buck(const __global numtyp4 *restrict x_, 
+                     const __global numtyp4 *restrict coeff1,
+                     const __global numtyp4 *restrict coeff2, 
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in,
+                     const __global int *dev_nbor, 
+                     const __global int *dev_packed,
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag,  const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -48,7 +52,7 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -106,13 +110,16 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_buck_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1_in,
+                          const __global numtyp4 *restrict coeff2_in, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -137,7 +144,7 @@ __kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp4 *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_buck_coul(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_ ,
+                          const __global numtyp4 *restrict cutsq, 
+                          const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -60,7 +65,7 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -136,15 +141,19 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *_cutsq, const numtyp qqrd2e, 
-                               const int t_per_atom) {
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict _cutsq, 
+                               const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -172,7 +181,7 @@ __kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp *cutsq, 
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1,
+                               const __global numtyp4 *restrict coeff2, 
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv, 
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp *restrict cutsq, 
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -61,7 +66,7 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -144,16 +149,21 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
  } // if ii
 }

-__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *cutsq,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp4 *restrict coeff1_in,
+                                    const __global numtyp4 *restrict coeff2_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch,
+                                    const __global numtyp *restrict q_, 
+                                    const __global numtyp *restrict cutsq,
+                                    const numtyp cut_coulsq, 
+                                    const numtyp qqrd2e, const numtyp g_ewald, 
+                                    const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -179,7 +189,7 @@ __kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coef
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_cg_cmm.cu
+++ b/lib/gpu/lal_cg_cmm.cu
@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict lj1,
+                       const __global numtyp4 *restrict lj3, 
+                       const int lj_types, 
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed,
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -48,7 +52,7 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -111,13 +115,16 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,__global int *dev_nbor,
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum,
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1_in,
+                            const __global numtyp4 *restrict lj3_in, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
+                            const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -142,7 +149,7 @@ __kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_cg_cmm_long.cu
+++ b/lib/gpu/lal_cg_cmm_long.cu
@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1,
+                            const __global numtyp4 *restrict lj3, 
+                            const int lj_types, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag,  const int vflag, const int inum,
+                            const int nbor_pitch,
+                            const __global numtyp *restrict q_ ,
+                            const numtyp cut_coulsq, const numtyp qqrd2e,
+                            const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -60,7 +65,7 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -148,15 +153,19 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp4 *restrict lj1_in,
+                                 const __global numtyp4 *restrict lj3_in, 
+                                 const __global numtyp *restrict sp_lj_in, 
+                                 const __global int *dev_nbor, 
+                                 const __global int *dev_packed,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv, 
+                                 const int eflag, const int vflag, 
+                                 const int inum, const int nbor_pitch,
+                                 const __global numtyp *restrict q_, 
+                                 const numtyp cut_coulsq, const numtyp qqrd2e,
+                                 const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -181,7 +190,7 @@ __kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types, __global numtyp *sp_lj_in,
-                          __global int *dev_nbor, __global int *dev_packed,
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const numtyp denom_lj,
-                          const numtyp cut_bothsq, const numtyp cut_ljsq,
-                          const numtyp cut_lj_innersq, const int t_per_atom) {
+__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
+                            const __global numtyp4 *restrict lj1,
+                            const int lj_types, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv, 
+                            const int eflag, const int vflag, const int inum, 
+                            const int nbor_pitch, 
+                            const __global numtyp *restrict q_,
+                            const numtyp cut_coulsq, const numtyp qqrd2e,
+                            const numtyp g_ewald, const numtyp denom_lj,
+                            const numtyp cut_bothsq, const numtyp cut_ljsq,
+                            const numtyp cut_lj_innersq, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -61,7 +66,7 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -152,17 +157,21 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor, 
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum,
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const numtyp denom_lj,
-                               const numtyp cut_bothsq, const numtyp cut_ljsq, 
-                               const numtyp cut_lj_innersq,
-                               const int t_per_atom) {
+__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp2 *restrict ljd_in,
+                                 const __global numtyp *restrict sp_lj_in, 
+                                 const __global int *dev_nbor, 
+                                 const __global int *dev_packed, 
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv, 
+                                 const int eflag, const int vflag, 
+                                 const int inum, const int nbor_pitch, 
+                                 const __global numtyp *restrict q_,
+                                 const numtyp cut_coulsq, const numtyp qqrd2e,
+                                 const numtyp g_ewald, const numtyp denom_lj,
+                                 const numtyp cut_bothsq, const numtyp cut_ljsq, 
+                                 const numtyp cut_lj_innersq,
+                                 const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -170,7 +179,8 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
-  ljd[tid]=ljd_in[tid];
+  if (tid<MAX_BIO_SHARED_TYPES)
+    ljd[tid]=ljd_in[tid];
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
  
@ -185,7 +195,7 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@ -24,16 +24,19 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
-                        __global numtyp4* lj3, const int lj_types, 
-                        __global numtyp *sp_lj_in, 
-                        __global numtyp4* colloid1, 
-                        __global numtyp4* colloid2,
-                        __global int *form, 
-                        __global int *dev_nbor, 
-                        __global int *dev_packed, __global acctyp4 *ans,
-                        __global acctyp *engv, const int eflag, 
-                        const int vflag, const int inum,
+__kernel void k_colloid(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1,
+                        const __global numtyp4 *restrict lj3, 
+                        const int lj_types, 
+                        const __global numtyp *restrict sp_lj_in, 
+                        const __global numtyp4 *restrict colloid1, 
+                        const __global numtyp4 *restrict colloid2,
+                        const __global int *form, 
+                        const __global int *dev_nbor, 
+                        const __global int *dev_packed, 
+                        __global acctyp4 *restrict ans,
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -52,7 +55,7 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -98,9 +101,10 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
          K[3] *= K[3]*K[3];
          K[6] = K[3]*K[3];
          fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
-          force = 4.0/15.0*fR * 
-            (2.0*(K[1]+K[2]) * (K[1]*(5.0*K[1]+22.0*K[2])+5.0*K[4]) * 
-          colloid2[mtype].w/K[6]-5.0) / K[0];
+          force = (numtyp)4.0/(numtyp)15.0*fR * 
+             ((numtyp)2.0*(K[1]+K[2]) * 
+             (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
+             colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
          force*=factor_lj;
        } else if (form[mtype]==2) { // LARGE_LARGE
          r = ucl_sqrt(rsq);
@ -131,8 +135,9 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
          fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
          evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
          numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
-          numtyp dUA = -colloid1[mtype].x/3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
-            ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
+          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
+                       (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
+                       ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
          force = factor_lj * (dUR+dUA)/r;
        }
  
@ -149,7 +154,8 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +  
              (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
          } else if (form[mtype]==2) {
-            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
+            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * 
+              ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
          } 
          energy+=factor_lj*(e-lj3[mtype].z); 
        }
@ -169,15 +175,17 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_colloid_fast(__global numtyp4 *x_, 
-                             __global numtyp4 *lj1_in,
-                             __global numtyp4 *lj3_in, 
-                             __global numtyp *sp_lj_in,
-                             __global numtyp4 *colloid1_in, 
-                             __global numtyp4 *colloid2_in,
-                             __global int *form_in, 
-                             __global int *dev_nbor, __global int *dev_packed, 
-                             __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1_in,
+                             const __global numtyp4 *restrict lj3_in, 
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global numtyp4 *restrict colloid1_in, 
+                             const __global numtyp4 *restrict colloid2_in,
+                             const __global int *form_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
                             const int eflag, const int vflag, const int inum, 
                             const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
@ -210,7 +218,7 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -257,7 +265,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
          K[6] = K[3]*K[3];
          fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
          force = (numtyp)4.0/(numtyp)15.0*fR * 
-            ((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
+            ((numtyp)2.0*(K[1]+K[2]) * 
+            (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
            colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
          force*=factor_lj;
        } else if (form[mtype]==2) { // LARGE_LARGE
@ -289,7 +298,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
          fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
          evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
          numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
-          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
+          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
+            (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
            ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
          force = factor_lj * (dUR+dUA)/r;
        } else force = (numtyp)0.0;
@ -304,10 +314,12 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          } else if (form[mtype]==1) {
            e=(numtyp)2.0/(numtyp)9.0*fR * 
-              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
-             colloid2[mtype].w/K[6]);
+              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
+              (numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
+              colloid2[mtype].w/K[6]);
          } else if (form[mtype]==2) {
-            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
+            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * 
+              ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
          } 
          energy+=factor_lj*(e-lj3[mtype].z); 
        }
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@ -31,12 +31,16 @@ texture<int2> q_tex;

 #define MY_PIS (acctyp)1.77245385090551602729

-__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, 
-                         __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                         __global int *dev_packed, __global acctyp4 *ans,
-                         __global acctyp *engv, const int eflag,
-                         const int vflag, const int inum,
-                         const int nbor_pitch, __global numtyp *q_ ,
+__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, 
+                         const int lj_types, 
+                         const __global numtyp *restrict sp_lj_in, 
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv, 
+                         const int eflag, const int vflag, const int inum,
+                         const int nbor_pitch, 
+                         const __global numtyp *restrict q_ ,
                         const numtyp cut_coulsq, const numtyp qqrd2e,
                         const numtyp e_shift, const numtyp f_shift, 
                         const numtyp alpha, const int t_per_atom) {
@ -58,7 +62,7 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -66,6 +70,12 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

@ -91,8 +101,8 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
        numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
        erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                 rsq*f_shift);
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+          rsq*f_shift);
        
        force = forcecoul * r2inv;

@ -122,11 +132,15 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
  } // if ii
 }

-__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
-                              __global int *dev_nbor, __global int *dev_packed,
-                              __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, 
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int *dev_nbor, 
+                              const __global int *dev_packed,
+                              __global acctyp4 *restrict ans, 
+                              __global acctyp *restrict engv, 
                              const int eflag, const int vflag, const int inum, 
-                              const int nbor_pitch, __global numtyp *q_,
+                              const int nbor_pitch, 
+                              const __global numtyp *restrict q_,
                              const numtyp cut_coulsq, const numtyp qqrd2e,
                              const numtyp e_shift, const numtyp f_shift, 
                              const numtyp alpha, const int t_per_atom) {
@ -148,13 +162,19 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
  
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
+
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
 
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;
@ -181,8 +201,8 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
        numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
        erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+          rsq*f_shift);
        
        force = forcecoul * r2inv;

--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@ -29,13 +29,112 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types,
-                          __global numtyp *sp_cl_in, __global int *dev_nbor,
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
+#if (ARCH < 300)
+
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
+                                                                            \
+    red_acc[0][tid]=f.x;                                                    \
+    red_acc[1][tid]=f.y;                                                    \
+    red_acc[2][tid]=f.z;                                                    \
+    red_acc[3][tid]=e_coul;                                                 \
+                                                                            \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      if (offset < s) {                                                     \
+        for (int r=0; r<4; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    f.x=red_acc[0][tid];                                                    \
+    f.y=red_acc[1][tid];                                                    \
+    f.z=red_acc[2][tid];                                                    \
+    e_coul=red_acc[3][tid];                                                 \
+                                                                            \
+    if (vflag>0) {                                                          \
+      for (int r=0; r<6; r++)                                               \
+        red_acc[r][tid]=virial[r];                                          \
+                                                                            \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+        if (offset < s) {                                                   \
+          for (int r=0; r<6; r++)                                           \
+            red_acc[r][tid] += red_acc[r][tid+s];                           \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      for (int r=0; r<6; r++)                                               \
+        virial[r]=red_acc[r][tid];                                          \
+    }                                                                       \
+  }                                                                         \
+                                                                            \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul;                                                          \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#else
+
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul;                                                          \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
+__kernel void k_coul_long(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types,
+                          const __global numtyp *restrict sp_cl_in, 
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
@ -55,7 +154,7 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -109,71 +208,22 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
      }

    } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[6][BLOCK_PAIR];
-
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=e_coul;
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<4; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      e_coul=red_acc[3][tid];
-
-      if (vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<6; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-      }
-    }
-
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1=(acctyp)0;
-        ap1+=inum;
-        *ap1=e_coul;
-        ap1+=inum;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1=virial[i];
-          ap1+=inum;
-        }
-      }
-      ans[ii]=f;
-    }
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
  } // if ii
 }

-__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in,
-                               __global numtyp* sp_cl_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv,
+__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict sp_cl_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
-                               const int nbor_pitch, __global numtyp *q_,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
@ -193,7 +243,7 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -247,61 +297,8 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
      }

    } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[6][BLOCK_PAIR];
-
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=e_coul;
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<4; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      e_coul=red_acc[3][tid];
-
-      if (vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<6; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-      }
-    }
-
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1=(acctyp)0;
-        ap1+=inum;
-        *ap1=e_coul;
-        ap1+=inum;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1=virial[i];
-          ap1+=inum;
-        }
-      }
-      ans[ii]=f;
-    }
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
  } // if ii
 }

--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -580,14 +580,14 @@ int DeviceT::compile_kernels() {
  k_info.set_function(*dev_program,"kernel_info");
  _compiled=true;

-  UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
+  UCL_Vector<int,int> gpu_lib_data(15,*gpu,UCL_NOT_PINNED);
  k_info.set_size(1,1);
  k_info.run(&gpu_lib_data);
  gpu_lib_data.update_host(false);
  
  _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
  #ifndef USE_OPENCL
-  if (_ptx_arch>gpu->arch())
+  if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
    return -4;
  #endif

@ -606,6 +606,7 @@ int DeviceT::compile_kernels() {
  _block_nbor_build=gpu_lib_data[10];
  _block_bio_pair=gpu_lib_data[11];
  _max_bio_shared_types=gpu_lib_data[12];
+  _block_ellipse=gpu_lib_data[14];

  if (static_cast<size_t>(_block_pair)>gpu->group_size())
    _block_pair=gpu->group_size();
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@ -17,7 +17,8 @@
 #include "lal_preprocessor.h"
 #endif

-__kernel void kernel_zero(__global int *mem, int numel) {
+__kernel void kernel_zero(__global int *restrict mem, 
+                          int numel) {
  int ii=GLOBAL_ID_X;
  
  if (ii<numel)
@ -39,4 +40,5 @@ __kernel void kernel_info(__global int *info) {
  info[11]=BLOCK_BIO_PAIR;
  info[12]=MAX_BIO_SHARED_TYPES;
  info[13]=THREADS_PER_CHARGE;
+  info[14]=BLOCK_ELLIPSE;
 }
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -228,6 +228,8 @@ class Device {
  inline int block_nbor_build() const { return _block_nbor_build; }
  /// Return the block size for "bio" pair styles
  inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the block size for "ellipse" pair styles
+  inline int block_ellipse() const { return _block_ellipse; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
  /// Architecture gpu code compiled for (returns 0 for OpenCL)
@ -292,7 +294,7 @@ class Device {

  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
  int _pppm_max_spline, _pppm_block;
-  int _block_pair, _max_shared_types;
+  int _block_pair, _block_ellipse, _max_shared_types;
  int _block_cell_2d, _block_cell_id, _block_nbor_build;
  int _block_bio_pair, _max_bio_shared_types;

--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -15,6 +15,23 @@

 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+texture<float4> mu_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+texture<int4,1> mu_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#define mu_tex mu_
+#endif
+
+#if (ARCH < 300)

 #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,      \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
@ -73,32 +90,63 @@
    ans[ii+inum]=tor;                                                       \
  }

-#ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
 #endif

-#else
-#define pos_tex x_
-#define q_tex q_
-#define mu_tex mu_
-#endif
-
-__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                           __global numtyp4 *mu_,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict mu_,
+                          const __global numtyp *restrict cutsq,
+                          const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -125,7 +173,7 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -291,16 +339,20 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *mu_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const int t_per_atom) {
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict mu_,
+                               const __global numtyp *restrict _cutsq, 
+                               const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -332,7 +384,7 @@ __kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@ -16,6 +16,24 @@
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"

+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+texture<float4> mu_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+texture<int4,1> mu_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#define mu_tex mu_
+#endif
+
+#if (ARCH < 300)
+
 #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,      \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
@ -73,32 +91,63 @@
    ans[ii+inum]=tor;                                                       \
  }

-#ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
 #endif

-#else
-#define pos_tex x_
-#define q_tex q_
-#define mu_tex mu_
-#endif
-
-__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
-                             __global numtyp4* lj3, const int lj_types, 
-                             __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                             __global int *dev_packed, __global acctyp4 *ans,
-                             __global acctyp *engv, const int eflag,
-                             const int vflag, const int inum,
-                             const int nbor_pitch, __global numtyp *q_ ,
-                             __global numtyp4 *mu_,
-                             __global numtyp *cutsq, const numtyp qqrd2e,
-                             const int t_per_atom) {
+__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_ ,
+                             const __global numtyp4 *restrict mu_,
+                             const __global numtyp *restrict cutsq, 
+                             const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -125,7 +174,7 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -322,15 +371,20 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                                  __global numtyp4* lj3_in, 
-                                  __global numtyp* sp_lj_in,
-                                  __global int *dev_nbor, __global int *dev_packed,
-                                  __global acctyp4 *ans, __global acctyp *engv, 
-                                  const int eflag, const int vflag, const int inum, 
-                                  const int nbor_pitch, __global numtyp *q_,
-                                  __global numtyp4 *mu_,
-                                  __global numtyp *_cutsq, const numtyp qqrd2e,
+__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in,
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const __global numtyp *restrict q_,
+                                  const __global numtyp4 *restrict mu_,
+                                  const __global numtyp *restrict _cutsq, 
+                                  const numtyp qqrd2e,
                                  const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -363,7 +417,7 @@ __kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@ -46,10 +46,11 @@ template <class numtyp, class acctyp>
 int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
               int **host_type2z2r, int *host_type2frho,
               double ***host_rhor_spline, double ***host_z2r_spline,
-               double ***host_frho_spline, double rdr, double rdrho, int nrhor,
-               int nrho, int nz2r, int nfrho, int nr, const int nlocal,
-               const int nall, const int max_nbors, const int maxspecial,
-               const double cell_size, const double gpu_split, FILE *_screen) 
+               double ***host_frho_spline, double rdr, double rdrho, 
+               double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr,
+               const int nlocal, const int nall, const int max_nbors,
+               const int maxspecial, const double cell_size, 
+               const double gpu_split, FILE *_screen) 
 {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
@ -97,6 +98,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
  _cutforcesq=host_cutforcesq;
  _rdr=rdr;
  _rdrho = rdrho;
+  _rhomax=rhomax;
  _nrhor=nrhor;
  _nrho=nrho;
  _nz2r=nz2r;
@ -468,15 +470,15 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
                            &this->nbor->dev_nbor,  &this->_nbor_data->begin(), 
                            &_fp, &this->ans->engv, &eflag, &ainum,
                            &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
-                            &_nrho, &_nr, &this->_threads_per_atom);
+                            &_rhomax, &_nrho, &_nr, &this->_threads_per_atom);
  } else {
    this->k_energy.set_size(GX,BX);
    this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
                       &rhor_spline2, &frho_spline1, &frho_spline2, 
                       &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, 
                       &this->ans->engv,&eflag, &ainum, &nbor_pitch,
-                       &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
-                       &this->_threads_per_atom);
+                       &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho,
+                       &_nr, &this->_threads_per_atom);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@ -52,8 +52,10 @@ texture<int4> z2r_sp2_tex;
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MAX(A,B) ((A) > (B) ? (A) : (B))

+#if (ARCH < 300)
+
 #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,           \
-                        eflag,vflag,engv,rdrho,nrho,i)                      \
+                        eflag,vflag,engv,rdrho,nrho,i,rhomax)               \
  if (t_per_atom>1) {                                                       \
    __local acctyp red_acc[BLOCK_PAIR];                                     \
    red_acc[tid]=rho;                                                       \
@ -76,10 +78,11 @@ texture<int4> z2r_sp2_tex;
    if (eflag>0) {                                                          \
      fetch4(coeff,index,frho_sp2_tex);                                     \
      energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
+      if (rho > rhomax) energy += fp*(rho-rhomax);                          \
      engv[ii]=(acctyp)2.0*energy;                                          \
    }                                                                       \
  }
-
+  
 #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom,     \
                      offset, elag, vflag, ans, engv)                       \
  if (t_per_atom>1) {                                                       \
@ -125,18 +128,80 @@ texture<int4> z2r_sp2_tex;
    ans[ii]=f;                                                              \
  }

-__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
-                            __global int *type2frho, 
-                            __global numtyp4 *rhor_spline2, 
-                            __global numtyp4 *frho_spline1,
-                            __global numtyp4 *frho_spline2,
-                            __global int *dev_nbor, __global int *dev_packed,
-                            __global numtyp *fp_, __global acctyp *engv, 
-                            const int eflag, const int inum, 
-                            const int nbor_pitch, const int ntypes, 
-                            const numtyp cutforcesq, const numtyp rdr, 
-                            const numtyp rdrho, const int nrho, const int nr,
-                            const int t_per_atom) {
+#else
+
+#define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,           \
+                        eflag,vflag,engv,rdrho,nrho,i,rhomax)               \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1)                           \
+        rho += shfl_xor(rho, s, t_per_atom);                                \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    numtyp p = rho*rdrho + (numtyp)1.0;                                     \
+    int m=p;                                                                \
+    m = MAX(1,MIN(m,nrho-1));                                               \
+    p -= m;                                                                 \
+    p = MIN(p,(numtyp)1.0);                                                 \
+    int index = type2frho[itype]*(nrho+1)+m;                                \
+    numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex);                        \
+    numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z;                          \
+    fp_[i]=fp;                                                              \
+    if (eflag>0) {                                                          \
+      fetch4(coeff,index,frho_sp2_tex);                                     \
+      energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
+      if (rho > rhomax) energy += fp*(rho-rhomax);                          \
+      engv[ii]=(acctyp)2.0*energy;                                          \
+    }                                                                       \
+  }
+
+#define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom,     \
+                          offset, eflag, vflag, ans, engv)                  \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv+=energy;                                                        \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
+__kernel void k_energy(const __global numtyp4 *restrict x_, 
+                       const __global int2 *restrict type2rhor_z2r,
+                       const __global int *restrict type2frho, 
+                       const __global numtyp4 *restrict rhor_spline2, 
+                       const __global numtyp4 *restrict frho_spline1,
+                       const __global numtyp4 *restrict frho_spline2,
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed,
+                       __global numtyp *restrict fp_, 
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int inum, const int nbor_pitch,
+                       const int ntypes,  const numtyp cutforcesq, 
+                       const numtyp rdr, const numtyp rdrho, 
+                       const numtyp rhomax, const int nrho,
+                       const int nr, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -144,7 +209,7 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
  acctyp energy = (acctyp)0;
   
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -180,24 +245,26 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
    } // for nbor
    
    store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
-        eflag,vflag,engv,rdrho,nrho,i);
+        eflag,vflag,engv,rdrho,nrho,i,rhomax);
  } // if ii
 }

-__kernel void k_energy_fast(__global numtyp4 *x_, 
-                                 __global int2 *type2rhor_z2r_in,
-                                 __global int *type2frho_in, 
-                                 __global numtyp4 *rhor_spline2, 
-                                 __global numtyp4 *frho_spline1,
-                                 __global numtyp4 *frho_spline2,
-                                 __global int *dev_nbor, 
-                                 __global int *dev_packed, __global numtyp *fp_, 
-                                 __global acctyp *engv, const int eflag, 
-                                 const int inum, const int nbor_pitch,
-                                 const int ntypes, const numtyp cutforcesq, 
-                                 const numtyp rdr, const numtyp rdrho,
-                                 const int nrho, const int nr, 
-                                 const int t_per_atom) {
+__kernel void k_energy_fast(const __global numtyp4 *restrict x_, 
+                            const __global int2 *restrict type2rhor_z2r_in,
+                            const __global int *restrict type2frho_in, 
+                            const __global numtyp4 *restrict rhor_spline2, 
+                            const __global numtyp4 *restrict frho_spline1,
+                            const __global numtyp4 *restrict frho_spline2,
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed, 
+                            __global numtyp *restrict fp_, 
+                            __global acctyp *restrict engv, 
+                            const int eflag,  const int inum, 
+                            const int nbor_pitch, const int ntypes, 
+                            const numtyp cutforcesq,  const numtyp rdr,
+                            const numtyp rdrho, const numtyp rhomax, 
+                            const int nrho, const int nr, 
+                            const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -218,7 +285,7 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
  __syncthreads(); 

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -254,22 +321,24 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
    } // for nbor
    
    store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
-                    eflag,vflag,engv,rdrho,nrho,i);
+                    eflag,vflag,engv,rdrho,nrho,i,rhomax);
  } // if ii
 }

-__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
-                          __global int2 *type2rhor_z2r,
-                          __global numtyp4 *rhor_spline1, 
-                          __global numtyp4 *z2r_spline1,
-                          __global numtyp4 *z2r_spline2,
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, 
-                          const int inum, const int nbor_pitch,
-                          const int ntypes, const numtyp cutforcesq, 
-                          const numtyp rdr, const int nr,
-                          const int t_per_atom) {
+__kernel void k_eam(const __global numtyp4 *restrict x_, 
+                    const __global numtyp *fp_,
+                    const __global int2 *type2rhor_z2r,
+                    const __global numtyp4 *rhor_spline1, 
+                    const __global numtyp4 *z2r_spline1,
+                    const __global numtyp4 *z2r_spline2,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed, 
+                    __global acctyp4 *ans,
+                    __global acctyp *engv, 
+                    const int eflag, const int vflag,  const int inum,
+                    const int nbor_pitch, const int ntypes, 
+                    const numtyp cutforcesq,  const numtyp rdr, const int nr,
+                    const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -283,7 +352,7 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -364,18 +433,19 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,

 }

-__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
-                          __global int2 *type2rhor_z2r_in,
-                          __global numtyp4 *rhor_spline1, 
-                          __global numtyp4 *z2r_spline1,
-                          __global numtyp4 *z2r_spline2,
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nbor_pitch,
-                          const numtyp cutforcesq, 
-                          const numtyp rdr, const int nr,
-                          const int t_per_atom) {
+__kernel void k_eam_fast(const __global numtyp4 *x_, 
+                         const __global numtyp *fp_,
+                         const __global int2 *type2rhor_z2r_in,
+                         const __global numtyp4 *rhor_spline1, 
+                         const __global numtyp4 *z2r_spline1,
+                         const __global numtyp4 *z2r_spline2,
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *ans, 
+                         __global acctyp *engv, 
+                         const int eflag, const int vflag, const int inum, 
+                         const int nbor_pitch, const numtyp cutforcesq, 
+                         const numtyp rdr, const int nr, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -395,7 +465,7 @@ __kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
  __syncthreads();

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@ -41,10 +41,10 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
  int init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
           int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline,
           double ***host_z2r_spline, double ***host_frho_spline, double rdr,
-           double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size, const double gpu_split,
-           FILE *_screen);
+           double rdrho, double rhomax, int nrhor, int nrho, int nz2r, 
+           int nfrho, int nr, const int nlocal, const int nall, 
+           const int max_nbors, const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *_screen);
  
  // Copy charges to device asynchronously
  inline void add_fp_data() {
@ -112,7 +112,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
  UCL_D_Vec<numtyp4> frho_spline1, frho_spline2;
  UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2;
    
-  numtyp _cutforcesq,_rdr,_rdrho;
+  numtyp _cutforcesq,_rdr,_rdrho, _rhomax;
  
  int _nfrho,_nrhor,_nrho,_nz2r,_nr;
  
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@ -31,7 +31,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
                 int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
                 double ***host_rhor_spline, double ***host_z2r_spline,
                 double ***host_frho_spline,
-                 double rdr, double rdrho, int nrhor, 
+                 double rdr, double rdrho, double rhomax, int nrhor, 
                 int nrho, int nz2r, int nfrho, int nr, 
                 const int nlocal, const int nall, const int max_nbors, 
                 const int maxspecial, const double cell_size, 
@ -66,9 +66,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
  if (world_me==0)
    init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
                       host_type2frho, host_rhor_spline, host_z2r_spline,
-                       host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
-                       nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split,
-                       screen);
+                       host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, 
+                       nfrho, nr, nlocal, nall, 300, maxspecial, cell_size,
+                       gpu_split, screen);

  EAMMF.device->world_barrier();
  if (message)
@ -86,9 +86,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
    if (gpu_rank==i && world_me!=0)
      init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
                         host_type2frho, host_rhor_spline, host_z2r_spline,
-                         host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
-                         nr, nlocal, nall, 300, maxspecial, cell_size, 
-                         gpu_split, screen);
+                         host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, 
+                         nz2r, nfrho, nr, nlocal, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);

    EAMMF.device->gpu_barrier();
    if (message) 
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@ -19,7 +19,7 @@
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

 #ifdef NV_KERNEL
-#include "lal_preprocessor.h"
+#include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex, quat_tex;
 #else
@ -30,11 +30,6 @@ texture<int4,1> pos_tex, quat_tex;
 #define quat_tex qif
 #endif

-#define atom_info(t_per_atom, ii, tid, offset)                               \
-  tid=THREAD_ID_X;                                                           \
-  offset=tid & (t_per_atom-1);                                               \
-  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
-
 #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
                    i, numj, stride, list_end, nbor)                         \
    nbor=nbor_mem+ii;                                                        \
@ -42,55 +37,11 @@ texture<int4,1> pos_tex, quat_tex;
    nbor+=nbor_stride;                                                       \
    numj=*nbor;                                                              \
    nbor+=nbor_stride;                                                       \
-    list_end=nbor+fast_mul(nbor_stride,numj);                                   \
-    nbor+=fast_mul(offset,nbor_stride);                                         \
+    list_end=nbor+fast_mul(nbor_stride,numj);                                \
+    nbor+=fast_mul(offset,nbor_stride);                                      \
    stride=fast_mul(t_per_atom,nbor_stride);

-#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
-                      eflag, vflag, ans, engv)                              \
-  if (t_per_atom>1) {                                                       \
-    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
-    red_acc[0][tid]=f.x;                                                    \
-    red_acc[1][tid]=f.y;                                                    \
-    red_acc[2][tid]=f.z;                                                    \
-    red_acc[3][tid]=energy;                                                 \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-      if (offset < s) {                                                     \
-        for (int r=0; r<4; r++)                                             \
-          red_acc[r][tid] += red_acc[r][tid+s];                             \
-      }                                                                     \
-    }                                                                       \
-    f.x=red_acc[0][tid];                                                    \
-    f.y=red_acc[1][tid];                                                    \
-    f.z=red_acc[2][tid];                                                    \
-    energy=red_acc[3][tid];                                                 \
-    if (vflag>0) {                                                          \
-      for (int r=0; r<6; r++)                                               \
-        red_acc[r][tid]=virial[r];                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-        if (offset < s) {                                                   \
-          for (int r=0; r<6; r++)                                           \
-            red_acc[r][tid] += red_acc[r][tid+s];                           \
-        }                                                                   \
-      }                                                                     \
-      for (int r=0; r<6; r++)                                               \
-        virial[r]=red_acc[r][tid];                                          \
-    }                                                                       \
-  }                                                                         \
-  if (offset==0) {                                                          \
-    engv+=ii;                                                               \
-    if (eflag>0) {                                                          \
-      *engv=energy;                                                         \
-      engv+=inum;                                                           \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i];                                                    \
-        engv+=inum;                                                         \
-      }                                                                     \
-    }                                                                       \
-    ans[ii]=f;                                                              \
-  }
+#if (ARCH < 300)

 #define store_answers_t(f, tor, energy, virial, ii, astride, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
@ -195,6 +146,80 @@ texture<int4,1> pos_tex, quat_tex;
    ans[ii]=old;                                                            \
  }

+#else
+
+#define store_answers_t(f, tor, energy, virial, ii, astride, tid,           \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=energy;                                                          \
+      ap1+=astride;                                                         \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=astride;                                                       \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+astride]=tor;                                                    \
+  }
+
+#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset,   \
+                    eflag, vflag, ans, engv)                                \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv+=energy;                                                        \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv+=virial[i];                                                   \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+  }
+
+#endif
+
 /* ----------------------------------------------------------------------
   dot product of 2 vectors
 ------------------------------------------------------------------------- */
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@ -29,22 +29,24 @@ texture<int4,1> pos_tex;
 // -- Only unpack neighbors matching the specified inclusive range of forms
 // -- Only unpack neighbors within cutoff
 // ---------------------------------------------------------------------------
-__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, 
-                          const int ntypes, __global int *dev_nbor,
+__kernel void kernel_nbor(const __global numtyp4 *restrict x_, 
+                          const __global numtyp2 *restrict cut_form, 
+                          const int ntypes, 
+                          __global int *dev_nbor,
                          const int nbor_pitch, const int start, const int inum, 
-                          __global int *dev_ij, const int form_low, 
-                          const int form_high) {
+                          const __global int *dev_ij, 
+                          const int form_low, const int form_high) {
                                
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X+start;

  if (ii<inum) {
-    __global int *nbor=dev_ij+ii;
+    const __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
+    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
  
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
@ -84,11 +86,13 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
 // -- Only unpack neighbors within cutoff
 // -- Fast version of routine that uses shared memory for LJ constants
 // ---------------------------------------------------------------------------
-__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
-                               __global int *dev_nbor, const int nbor_pitch, 
-                               const int start, const int inum, 
-                               __global int *dev_ij, const int form_low, 
-                               const int form_high) {
+__kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp2 *restrict cut_form,
+                               __global int *dev_nbor, 
+                               const int nbor_pitch, const int start, 
+                               const int inum, 
+                               const __global int *dev_ij, 
+                               const int form_low, const int form_high) {
                                
  int ii=THREAD_ID_X;
  __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -101,12 +105,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
  __syncthreads();

  if (ii<inum) {
-    __global int *nbor=dev_ij+ii;
+    const __global int *nbor=dev_ij+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;
-    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
+    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
    __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
  
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -24,12 +24,15 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
+__kernel void k_gauss(const __global numtyp4 *restrict x_, 
+                      const __global numtyp4 *restrict gauss1,
                      const int lj_types, 
-                      __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                      __global int *dev_packed, __global acctyp4 *ans,
-                      __global acctyp *engv, const int eflag, 
-                      const int vflag, const int inum,
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans,
+                      __global acctyp *restrict engv, 
+                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -48,7 +51,7 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -104,10 +107,13 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
  } // if ii
 }

-__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
-                           __global numtyp* sp_lj_in, 
-                           __global int *dev_nbor, __global int *dev_packed, 
-                           __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, 
+                           const __global numtyp4 *restrict gauss1_in,
+                           const __global numtyp *restrict sp_lj_in, 
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed, 
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv, 
                           const int eflag, const int vflag, const int inum, 
                           const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
@ -131,7 +137,7 @@ __kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -80,13 +80,20 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
 		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }

-__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
-                         __global numtyp4* shape, __global numtyp4* well, 
-                         __global numtyp *gum, __global numtyp2* sig_eps, 
-                         const int ntypes, __global numtyp *lshape, 
-                         __global int *dev_nbor, const int stride, 
-                         __global acctyp4 *ans, const int astride, 
-                         __global acctyp *engv, __global int *err_flag, 
+__kernel void k_gayberne(const __global numtyp4 *restrict x_,
+                         const __global numtyp4 *restrict q,
+                         const __global numtyp4 *restrict shape, 
+                         const __global numtyp4 *restrict well, 
+                         const __global numtyp *restrict gum, 
+                         const __global numtyp2 *restrict sig_eps, 
+                         const int ntypes, 
+                         const __global numtyp *restrict lshape, 
+                         const __global int *dev_nbor, 
+                         const int stride, 
+                         __global acctyp4 *restrict ans, 
+                         const int astride, 
+                         __global acctyp *restrict engv, 
+                         __global int *restrict err_flag, 
                         const int eflag, const int vflag, const int inum,
                         const int t_per_atom) {
  int tid, ii, offset;
@ -112,7 +119,7 @@ __kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
    int i, numj, n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                n_stride,nbor_end,nbor);
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@ -17,15 +17,22 @@
 #include "lal_ellipsoid_extra.h"
 #endif

-__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
-                  __global numtyp4 *q, __global numtyp4* shape,
-                  __global numtyp4* well, __global numtyp *gum, 
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global numtyp *lshape, __global int *dev_nbor, 
-                  const int stride, __global acctyp4 *ans, 
-                  __global acctyp *engv, __global int *err_flag, 
-                  const int eflag, const int vflag,const int start, 
-                  const int inum, const int t_per_atom) {
+__kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
+                                          const __global numtyp4 *restrict q, 
+                                          const __global numtyp4 *restrict shape,
+                                          const __global numtyp4 *restrict well, 
+                                          const __global numtyp *restrict gum, 
+                                          const __global numtyp2 *restrict sig_eps,
+                                          const int ntypes, 
+                                          const __global numtyp *restrict lshape,
+                                          const __global int *dev_nbor, 
+                                          const int stride,
+                                          __global acctyp4 *restrict ans, 
+                                          __global acctyp *restrict engv,
+                                          __global int *restrict err_flag, 
+                                          const int eflag, const int vflag,
+                                          const int start, const int inum, 
+                                          const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  ii+=start;
@ -46,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
    int i, numj, n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                n_stride,nbor_end,nbor);
@ -236,11 +243,16 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
  } // if ii
 }

-__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                            __global numtyp4* lj3, const int lj_types, 
-                            __global numtyp *gum, const int stride, 
-                            __global int *dev_ij, __global acctyp4 *ans, 
-                            __global acctyp *engv, __global int *err_flag, 
+__kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1, 
+                            const __global numtyp4 *restrict lj3, 
+                            const int lj_types, 
+                            const __global numtyp *restrict gum, 
+                            const int stride, 
+                            const __global int *dev_ij, 
+                            __global acctyp4 *restrict ans, 
+                            __global acctyp *restrict engv, 
+                            __global int *restrict err_flag, 
                            const int eflag, const int vflag, const int start,
                            const int inum, const int t_per_atom) {
  int tid, ii, offset;
@ -263,7 +275,7 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                n_stride,list_end,nbor);
@ -318,13 +330,18 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
-                                 __global numtyp4* lj3_in, __global numtyp *gum, 
-                                 const int stride, __global int *dev_ij,
-                                 __global acctyp4 *ans, __global acctyp *engv,
-                                 __global int *err_flag, const int eflag,
-                                 const int vflag, const int start, 
-                                 const int inum, const int t_per_atom) {
+__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp4 *restrict lj1_in, 
+                                 const __global numtyp4 *restrict lj3_in, 
+                                 const __global numtyp *restrict gum, 
+                                 const int stride, 
+                                 const __global int *dev_ij,
+                                 __global acctyp4 *restrict ans, 
+                                 __global acctyp *restrict engv,
+                                 __global int *restrict err_flag, 
+                                 const int eflag, const int vflag, 
+                                 const int start, const int inum, 
+                                 const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  ii+=start;
@ -352,7 +369,7 @@ __kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj(const __global numtyp4 *restrict x_, 
+                   const __global numtyp4 *restrict lj1,
+                   const __global numtyp4 *restrict lj3, 
+                   const int lj_types, 
+                   const __global numtyp *restrict sp_lj_in, 
+                   const __global int * dev_nbor, 
+                   const __global int * dev_packed, 
+                   __global acctyp4 *restrict ans, 
+                   __global acctyp *restrict engv, 
+                   const int eflag, const int vflag, const int inum,
+                   const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -48,7 +52,7 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -103,13 +107,16 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj_fast(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1_in,
+                        const __global numtyp4 *restrict lj3_in, 
+                        const __global numtyp *restrict sp_lj_in, 
+                        const __global int * dev_nbor, 
+                        const __global int * dev_packed, 
+                        __global acctyp4 *restrict ans, 
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum, 
+                        const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -134,7 +141,7 @@ __kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj96(const __global numtyp4 *restrict x_,
+                     const __global numtyp4 *restrict lj1,
+                     const __global numtyp4 *restrict lj3, 
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in, 
+                     const __global int *dev_nbor, 
+                     const __global int *dev_packed, 
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -48,7 +52,7 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -104,13 +108,16 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1_in,
+                          const __global numtyp4 *restrict lj3_in, 
+                          const __global numtyp *restrict sp_lj_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -135,7 +142,7 @@ __kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
+                               const __global numtyp4 *restrict lj1,
+                               const __global numtyp4 *restrict lj3,
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag,  const int vflag, 
+                               const int inum, const int nbor_pitch, 
+                               const __global numtyp *restrict q_, 
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -60,7 +65,7 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -141,15 +146,21 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp4 *restrict lj1_in,
+                                    const __global numtyp4 *restrict lj3_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans, 
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch, 
+                                    const __global numtyp *restrict q_,
+                                    const numtyp cut_coulsq, 
+                                    const numtyp qqrd2e,
+                                    const numtyp g_ewald, 
+                                    const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -175,7 +186,7 @@ __kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_lj_coul(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1,
+                        const __global numtyp4 *restrict  lj3, 
+                        const int lj_types, 
+                        const __global numtyp *restrict sp_lj_in,
+                        const __global int *dev_nbor, 
+                        const __global int *dev_packed, 
+                        __global acctyp4 *restrict ans,
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum,
+                        const int nbor_pitch, 
+                        const __global numtyp *restrict q_, 
+                        const __global numtyp *restrict cutsq, 
+                        const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -60,7 +65,7 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -132,15 +137,19 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const int t_per_atom) {
+__kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
+                             const __global numtyp4 *restrict lj1_in,
+                             const __global numtyp4 *restrict lj3_in, 
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_,
+                             const __global numtyp *restrict _cutsq, 
+                             const numtyp qqrd2e, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -168,7 +177,7 @@ __kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const numtyp kappa,
-                          const int t_per_atom) {
+__kernel void k_lj_debye(const __global numtyp4 *restrict x_, 
+                         const __global numtyp4 *restrict lj1,
+                         const __global numtyp4 *restrict lj3, 
+                         const int lj_types, 
+                         const __global numtyp *restrict sp_lj_in, 
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
+                         const int nbor_pitch,
+                         const __global numtyp *restrict q_ ,
+                         const __global numtyp *restrict cutsq, 
+                         const numtyp qqrd2e, const numtyp kappa,
+                         const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -61,7 +66,7 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -139,16 +144,20 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const numtyp kappa,
-                               const int t_per_atom) {
+__kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
+                              const __global numtyp4 *restrict lj1_in,
+                              const __global numtyp4 *restrict lj3_in, 
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int *dev_nbor,
+                              const __global int *dev_packed,
+                              __global acctyp4 *restrict ans,
+                              __global acctyp *restrict engv, 
+                              const int eflag, const int vflag, const int inum, 
+                              const int nbor_pitch, 
+                              const __global numtyp *restrict q_, 
+                              const __global numtyp *restrict _cutsq,
+                              const numtyp qqrd2e, const numtyp kappa,
+                              const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -176,7 +185,7 @@ __kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_,
+                             const numtyp cut_coulsq, const numtyp qqrd2e,
+                             const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -60,7 +65,7 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -139,15 +144,19 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in,
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag,
+                                  const int inum,  const int nbor_pitch,
+                                  const __global numtyp *restrict q_,
+                                  const numtyp cut_coulsq, const numtyp qqrd2e,
+                                  const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -173,7 +182,7 @@ __kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@ -31,13 +31,18 @@ texture<int2> q_tex;

 #define MY_PIS (acctyp)1.77245385090551602729

-__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
-                       __global numtyp4* lj3, const int lj_types, 
-                       __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                       __global int *dev_packed, __global acctyp4 *ans,
-                       __global acctyp *engv, const int eflag,
-                       const int vflag, const int inum,
-                       const int nbor_pitch, __global numtyp *q_ ,
+__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict lj1,
+                       const __global numtyp4 *restrict lj3, 
+                       const int lj_types, 
+                       const __global numtyp *restrict sp_lj_in, 
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed, 
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, 
+                       const __global numtyp *restrict q_ ,
                       const numtyp cut_coulsq, const numtyp qqrd2e,
                       const numtyp e_shift, const numtyp f_shift, 
                       const numtyp alpha, const int t_per_atom) {
@ -63,7 +68,7 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -72,6 +77,12 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

@ -107,8 +118,8 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
          numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
          erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-          forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+            rsq*f_shift);
        } else
          forcecoul = (numtyp)0.0;

@ -144,12 +155,17 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                            __global numtyp4* lj3_in, __global numtyp* sp_lj_in,
-                            __global int *dev_nbor, __global int *dev_packed,
-                            __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1_in,
+                            const __global numtyp4 *restrict lj3_in,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv, 
                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, __global numtyp *q_,
+                            const int nbor_pitch,
+                            const __global numtyp *restrict q_,
                            const numtyp cut_coulsq, const numtyp qqrd2e,
                            const numtyp e_shift, const numtyp f_shift, 
                            const numtyp alpha, const int t_per_atom) {
@ -178,7 +194,7 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -188,6 +204,12 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

@ -222,8 +244,8 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
          numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
          erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-          forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+            rsq*f_shift);
        } else
          forcecoul = (numtyp)0.0;

--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -26,12 +26,16 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
+__kernel void k_lj_expand(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -50,7 +54,7 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -108,11 +112,14 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
                               const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
@ -139,7 +146,7 @@ __kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@ -26,13 +26,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
-                          __global numtyp2* mor2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_morse(const __global numtyp4 *restrict x_, 
+                      const __global numtyp4 *restrict mor1,
+                      const __global numtyp2 *restrict mor2, 
+                      const int lj_types, 
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans,
+                      __global acctyp *restrict engv,
+                      const int eflag, const int vflag, const int inum,
+                      const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -50,7 +54,7 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -106,13 +110,16 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
  } // if ii
 }

-__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
-                               __global numtyp2* mor2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_morse_fast(const __global numtyp4 *restrict x_, 
+                           const __global numtyp4 *restrict mor1_in,
+                           const __global numtyp2 *restrict mor2_in, 
+                           const __global numtyp *restrict sp_lj_in,
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed,
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv, 
+                           const int eflag, const int vflag, const int inum, 
+                           const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -137,7 +144,7 @@ __kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@ -17,7 +17,8 @@
 #include "lal_preprocessor.h"
 #endif

-__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
+__kernel void kernel_unpack(__global int *dev_nbor, 
+                            const __global int *dev_ij,
                            const int inum, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int offset=tid & (t_per_atom-1);
@ -27,8 +28,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
    __global int *nbor=dev_nbor+ii+inum;
    int numj=*nbor;
    nbor+=inum;
-    __global int *list=dev_ij+*nbor;
-    __global int *list_end=list+numj;
+    const __global int *list=dev_ij+*nbor;
+    const __global int *list_end=list+numj;
    list+=offset;
    nbor+=fast_mul(ii,t_per_atom-1)+offset;
    int stride=fast_mul(t_per_atom,inum);
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -22,7 +22,9 @@ texture<float4> pos_tex;
 texture<int4,1> pos_tex;
 #endif

-__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
+__kernel void calc_cell_id(const numtyp4 *restrict pos, 
+                           unsigned *restrict cell_id, 
+                           int *restrict particle_id,
                           numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, 
                           numtyp i_cell_size, int ncellx, int ncelly, 
                           int ncellz, int inum, int nall, 
@ -62,8 +64,9 @@ __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
  }
 }

-__kernel void kernel_calc_cell_counts(unsigned *cell_id,
-                                      int *cell_counts, int nall, int ncell) {
+__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
+                                      int *restrict cell_counts, 
+                                      int nall, int ncell) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nall) {
    int id = cell_id[idx];
@ -94,8 +97,9 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,



-__kernel void transpose(__global int *out, __global int *in, int columns_in, 
-                        int rows_in)
+__kernel void transpose(__global int *restrict out, 
+                        const __global int *restrict in, 
+                        int columns_in, int rows_in)
 {
 	__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 	
@ -117,9 +121,9 @@ __kernel void transpose(__global int *out, __global int *in, int columns_in,
 		out[j*rows_in+i] = block[ti][tj];
 }

-__kernel void calc_neigh_list_cell(__global numtyp4 *x_, 
-                                   __global int *cell_particle_id, 
-                                   __global int *cell_counts, 
+__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, 
+                                   const __global int *restrict cell_particle_id, 
+                                   const __global int *restrict cell_counts, 
                                   __global int *nbor_list,
                                   __global int *host_nbor_list, 
                                   __global int *host_numj, 
@ -234,8 +238,10 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,

 __kernel void kernel_special(__global int *dev_nbor, 
                             __global int *host_nbor_list, 
-                             __global int *host_numj, __global int *tag,
-                             __global int *nspecial, __global int *special,
+                             const __global int *host_numj, 
+                             const __global int *restrict tag,
+                             const __global int *restrict nspecial, 
+                             const __global int *restrict special,
                             int inum, int nt, int max_nbors, int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -41,15 +41,18 @@ texture<int2> q_tex;
 // Number of pencils per block for charge spread
 #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)

-__kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
+__kernel void particle_map(const __global numtyp4 *restrict x_,  
+                           const __global numtyp *restrict q_,
                           const grdtyp delvolinv, const int nlocal, 
-                           __global int *counts, __global grdtyp4 *ans, 
+                           __global int *restrict counts, 
+                           __global grdtyp4 *restrict ans, 
                           const grdtyp b_lo_x, const grdtyp b_lo_y,
                           const grdtyp b_lo_z, const grdtyp delxinv,
                           const grdtyp delyinv, const grdtyp delzinv,
                           const int nlocal_x, const int nlocal_y,
                           const int nlocal_z, const int atom_stride,
-                           const int max_atoms, __global int *error) {
+                           const int max_atoms, 
+                           __global int *restrict error) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;

@ -97,8 +100,10 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,

 /* --------------------------- */

-__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
-                       __global grdtyp *brick, __global grdtyp *_rho_coeff,
+__kernel void make_rho(const __global int *restrict counts, 
+                       const __global grdtyp4 *restrict atoms,
+                       __global grdtyp *restrict brick, 
+                       const __global grdtyp *restrict _rho_coeff,
                       const int atom_stride, const int npts_x,
                       const int npts_y, const int npts_z, const int nlocal_x,
                       const int nlocal_y, const int nlocal_z,
@ -192,15 +197,17 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
  }
 }

-__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
-                     const int nlocal, __global grdtyp4 *brick,
-                     __global grdtyp *_rho_coeff, const int npts_x,
-                     const int npts_yx, const grdtyp b_lo_x,
+__kernel void interp(const __global numtyp4 *restrict x_, 
+                     const __global numtyp *restrict q_,
+                     const int nlocal, 
+                     const __global grdtyp4 *restrict brick,
+                     const __global grdtyp *restrict _rho_coeff, 
+                     const int npts_x, const int npts_yx, const grdtyp b_lo_x,
                     const grdtyp b_lo_y, const grdtyp b_lo_z,
                     const grdtyp delxinv,  const grdtyp delyinv,
                     const grdtyp delzinv, const int order,
                     const int order2, const grdtyp qqrd2e_scale, 
-                     __global acctyp4 *ans) {
+                     __global acctyp4 *restrict ans) {
  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
  __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
  __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -56,8 +56,7 @@
 //     Definition:   Default thread block size for "bio" pair styles
 //  MAX_BIO_SHARED_TYPES
 //     Definition:   Max # of atom type params can be stored in shared memory
-//     Restrictions:  MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
-//                    MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
+//     Restrictions:  MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
 //
 //*************************************************************************/

@ -80,6 +79,7 @@
 #define __kernel extern "C" __global__
 #define __local __shared__
 #define __global  
+#define restrict __restrict__
 #define atom_add atomicAdd
 #define ucl_inline static __inline__ __device__ 

@ -116,8 +116,22 @@
 #define BLOCK_NBOR_BUILD 128
 #define BLOCK_PAIR 512
 #define BLOCK_BIO_PAIR 512
+#define BLOCK_ELLIPSE 256
 #define MAX_SHARED_TYPES 11

+#ifdef _SINGLE_SINGLE
+#define shfl_xor __shfl_xor
+#else
+ucl_inline double shfl_xor(double var, int laneMask, int width) {
+  int2 tmp;
+  tmp.x = __double2hiint(var);
+  tmp.y = __double2loint(var);
+  tmp.x = __shfl_xor(tmp.x,laneMask,width);
+  tmp.y = __shfl_xor(tmp.y,laneMask,width);
+  return __hiloint2double(tmp.x,tmp.y);
+}
+#endif
+
 #endif

 #endif
@ -380,3 +394,7 @@ typedef struct _double4 double4;
 #define NEIGHMASK 0x3FFFFFFF
 ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };

+#ifndef BLOCK_ELLIPSE
+#define BLOCK_ELLIPSE BLOCK_PAIR
+#endif
+
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@ -32,14 +32,20 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
  return ans;
 }

-__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
-                          __global numtyp4* shape, __global numtyp4* well, 
-                          __global numtyp *splj, __global numtyp2* sig_eps, 
-                          const int ntypes, __global int *dev_nbor,
-                          const int stride,  __global acctyp4 *ans,
-                          const int astride, __global acctyp *engv,
-                          __global int *err_flag, const int eflag,
-                          const int vflag, const int inum,
+__kernel void k_resquared(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict q,
+                          const __global numtyp4 *restrict shape, 
+                          const __global numtyp4 *restrict well, 
+                          const __global numtyp *restrict splj, 
+                          const __global numtyp2 *restrict sig_eps, 
+                          const int ntypes, 
+                          const __global int *dev_nbor,
+                          const int stride,  
+                          __global acctyp4 *restrict ans,
+                          const int astride, 
+                          __global acctyp *restrict engv,
+                          __global int *restrict err_flag, 
+                          const int eflag, const int vflag, const int inum,
                          const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
@ -68,7 +74,7 @@ __kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
    int i, numj, n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                n_stride,nbor_end,nbor);
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -17,15 +17,132 @@
 #include "lal_ellipsoid_extra.h"
 #endif

-__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
-                  __global numtyp4 *q, __global numtyp4* shape, 
-                  __global numtyp4* well, __global numtyp *splj, 
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global int *dev_nbor, const int stride, 
-                  __global acctyp4 *ans, const int astride, 
-                  __global acctyp *engv, __global int *err_flag, 
-                   const int eflag, const int vflag, const int inum,
-                   const int t_per_atom) {
+#if (ARCH < 300)
+
+#define store_answers_rt(f, tor, energy, virial, ii, astride, tid,           \
+                         t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                        \
+    __local acctyp red_acc[7][BLOCK_PAIR];                                   \
+    red_acc[0][tid]=f.x;                                                     \
+    red_acc[1][tid]=f.y;                                                     \
+    red_acc[2][tid]=f.z;                                                     \
+    red_acc[3][tid]=tor.x;                                                   \
+    red_acc[4][tid]=tor.y;                                                   \
+    red_acc[5][tid]=tor.z;                                                   \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                          \
+      if (offset < s) {                                                      \
+        for (int r=0; r<6; r++)                                              \
+          red_acc[r][tid] += red_acc[r][tid+s];                              \
+      }                                                                      \
+    }                                                                        \
+    f.x=red_acc[0][tid];                                                     \
+    f.y=red_acc[1][tid];                                                     \
+    f.z=red_acc[2][tid];                                                     \
+    tor.x=red_acc[3][tid];                                                   \
+    tor.y=red_acc[4][tid];                                                   \
+    tor.z=red_acc[5][tid];                                                   \
+    if (eflag>0 || vflag>0) {                                                \
+      for (int r=0; r<6; r++)                                                \
+        red_acc[r][tid]=virial[r];                                           \
+      red_acc[6][tid]=energy;                                                \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                        \
+        if (offset < s) {                                                    \
+          for (int r=0; r<7; r++)                                            \
+            red_acc[r][tid] += red_acc[r][tid+s];                            \
+        }                                                                    \
+      }                                                                      \
+      for (int r=0; r<6; r++)                                                \
+        virial[r]=red_acc[r][tid];                                           \
+      energy=red_acc[6][tid];                                                \
+    }                                                                        \
+  }                                                                          \
+  if (offset==0) {                                                           \
+    __global acctyp *ap1=engv+ii;                                            \
+    if (eflag>0) {                                                           \
+      *ap1+=energy;                                                          \
+      ap1+=astride;                                                          \
+    }                                                                        \
+    if (vflag>0) {                                                           \
+      for (int i=0; i<6; i++) {                                              \
+        *ap1+=virial[i];                                                     \
+        ap1+=astride;                                                        \
+      }                                                                      \
+    }                                                                        \
+    acctyp4 old=ans[ii];                                                     \
+    old.x+=f.x;                                                              \
+    old.y+=f.y;                                                              \
+    old.z+=f.z;                                                              \
+    ans[ii]=old;                                                             \
+    old=ans[ii+astride];                                                     \
+    old.x+=tor.x;                                                            \
+    old.y+=tor.y;                                                            \
+    old.z+=tor.z;                                                            \
+    ans[ii+astride]=old;                                                     \
+  }
+
+#else
+
+#define store_answers_rt(f, tor, energy, virial, ii, astride, tid,          \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1+=energy;                                                         \
+      ap1+=astride;                                                         \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1+=virial[i];                                                    \
+        ap1+=astride;                                                       \
+      }                                                                     \
+    }                                                                       \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+    old=ans[ii+astride];                                                    \
+    old.x+=tor.x;                                                           \
+    old.y+=tor.y;                                                           \
+    old.z+=tor.z;                                                           \
+    ans[ii+astride]=old;                                                    \
+  }
+
+#endif
+
+__kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
+                                           const __global numtyp4 *restrict q,
+                                           const __global numtyp4 *restrict shape, 
+                                           const __global numtyp4 *restrict well,
+                                           const __global numtyp *restrict splj, 
+                                           const __global numtyp2 *restrict sig_eps,
+                                           const int ntypes, 
+                                           const __global int *dev_nbor,
+                                           const int stride, 
+                                           __global acctyp4 *restrict ans,
+                                           const int astride, 
+                                           __global acctyp *restrict engv, 
+                                           __global int *restrict err_flag, 
+                                           const int eflag, const int vflag, 
+                                           const int inum, 
+                                           const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -55,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
    int i, numj, n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                n_stride,nbor_end,nbor);
@ -260,86 +377,26 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
      }

    } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[7][BLOCK_PAIR];
-      
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=tor.x;
-      red_acc[4][tid]=tor.y;
-      red_acc[5][tid]=tor.z;
-  
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<6; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-      
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      tor.x=red_acc[3][tid];
-      tor.y=red_acc[4][tid];
-      tor.z=red_acc[5][tid];
-  
-      if (eflag>0 || vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-        red_acc[6][tid]=energy;
-  
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<7; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-      
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-        energy=red_acc[6][tid];
-      }
-    }
-  
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1+=energy;
-        ap1+=astride;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1+=virial[i];
-          ap1+=astride;
-        }
-      }
-      acctyp4 old=ans[ii];
-      old.x+=f.x;
-      old.y+=f.y;
-      old.z+=f.z;
-      ans[ii]=old;
-      
-      old=ans[ii+astride];
-      old.x+=tor.x;
-      old.y+=tor.y;
-      old.z+=tor.z;
-      ans[ii+astride]=old;
-    }
+    store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
  } // if ii
 }

-__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
-                  __global numtyp4 *q, __global numtyp4* shape,
-                  __global numtyp4* well, __global numtyp *splj,
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global int *dev_nbor, const int stride, 
-                  __global acctyp4 *ans, __global acctyp *engv, 
-                  __global int *err_flag, const int eflag, const int vflag,
-                  const int start, const int inum, const int t_per_atom) {
+__kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
+                                           const __global numtyp4 *restrict q, 
+                                           const __global numtyp4 *restrict shape,
+                                           const __global numtyp4 *restrict well,
+                                           const __global numtyp *restrict splj,
+                                           const __global numtyp2 *restrict sig_eps,
+                                           const int ntypes, 
+                                           const __global int *dev_nbor,
+                                           const int stride, 
+                                           __global acctyp4 *restrict ans,
+                                           __global acctyp *restrict engv, 
+                                           __global int *restrict err_flag,
+                                           const int eflag, const int vflag,
+                                           const int start, const int inum, 
+                                           const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  ii+=start;
@ -366,7 +423,7 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
    virial[i]=(acctyp)0;

  if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
    int j, numj, n_stride;
    nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
                n_stride,nbor_end,nbor);
@ -525,11 +582,16 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
  } // if ii
 }

-__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                             __global numtyp4* lj3, const int lj_types, 
-                             __global numtyp *gum, const int stride, 
-                             __global int *dev_ij, __global acctyp4 *ans,
-                             __global acctyp *engv, __global int *err_flag,
+__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1, 
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict gum, 
+                             const int stride, 
+                             const __global int *dev_ij, 
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             __global int *restrict err_flag,
                             const int eflag, const int vflag, const int start,
                             const int inum, const int t_per_atom) {
  int tid, ii, offset;
@ -552,7 +614,7 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                n_stride,list_end,nbor);
@ -606,12 +668,18 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
  } // if ii
 }

-__kernel void k_resquared_lj_fast(__global numtyp4 *x_, 
-                  __global numtyp4 *lj1_in, __global numtyp4* lj3_in, 
-                  __global numtyp *gum, const int stride, __global int *dev_ij,
-                  __global acctyp4 *ans, __global acctyp *engv, 
-                  __global int *err_flag, const int eflag, const int vflag,
-                  const int start, const int inum, const int t_per_atom) {
+__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in, 
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict gum, 
+                                  const int stride,
+                                  const __global int *dev_ij,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv, 
+                                  __global int *restrict err_flag,
+                                  const int eflag, const int vflag,
+                                  const int start, const int inum, 
+                                  const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  ii+=start;
@ -639,7 +707,7 @@ __kernel void k_resquared_lj_fast(__global numtyp4 *x_,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                n_stride,list_end,nbor);
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@ -39,18 +39,21 @@ typedef union {

 /// ---------------- LOOKUP -------------------------------------------------

-__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table(const __global numtyp4 *restrict x_, 
+                      const __global int *restrict tabindex,
+                      const __global numtyp4 *restrict coeff2, 
+                      const __global numtyp4 *restrict coeff3,
+                      const __global numtyp4 *restrict coeff4,
+                      const int lj_types,
+                      const __global numtyp *restrict cutsq,
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans, 
+                      __global acctyp *restrict engv, 
+                      const int eflag, const int vflag, const int inum, 
+                      const int nbor_pitch, const int t_per_atom, 
+                      int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -70,7 +73,7 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -130,17 +133,20 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
  } // if ii
 }

-__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_fast(const __global numtyp4 *restrict x_,
+                           const __global int *restrict tabindex,
+                           const __global numtyp4 *restrict coeff2, 
+                           const __global numtyp4 *restrict coeff3,
+                           const __global numtyp4 *restrict coeff4,
+                           const __global numtyp *restrict cutsq_in,
+                           const __global numtyp *restrict sp_lj_in, 
+                           const __global int *dev_nbor, 
+                           const __global int *dev_packed, 
+                           __global acctyp4 *restrict ans, 
+                           __global acctyp *restrict engv, 
+                           const int eflag, const int vflag, const int inum, 
+                           const int nbor_pitch, const int t_per_atom, 
+                           int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -164,7 +170,7 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -227,18 +233,21 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,

 /// ---------------- LINEAR -------------------------------------------------

-__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_linear(const __global numtyp4 *restrict x_, 
+                             const __global int *restrict tabindex,
+                             const __global numtyp4 *restrict coeff2, 
+                             const __global numtyp4 *restrict coeff3,
+                             const __global numtyp4 *restrict coeff4,
+                             const int lj_types,
+                             const __global numtyp *restrict cutsq,
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -258,7 +267,7 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -322,17 +331,20 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
  } // if ii
 }

-__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, 
+                                  const __global int *restrict tabindex,
+                                  const __global numtyp4 *restrict coeff2, 
+                                  const __global numtyp4 *restrict coeff3,
+                                  const __global numtyp4 *restrict coeff4,
+                                  const __global numtyp *restrict cutsq_in,
+                                  const __global numtyp *restrict sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -356,7 +368,7 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -423,18 +435,21 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,

 /// ---------------- SPLINE -------------------------------------------------

-__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_spline(const __global numtyp4 *restrict x_, 
+                             const __global int *restrict tabindex,
+                             const __global numtyp4 *restrict coeff2, 
+                             const __global numtyp4 *restrict coeff3,
+                             const __global numtyp4 *restrict coeff4,
+                             const int lj_types,
+                             const __global numtyp *restrict cutsq,
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -454,7 +469,7 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -525,17 +540,20 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
  } // if ii
 }

-__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_spline_fast(const __global numtyp4 *x_, 
+                                  const __global int *tabindex,
+                                  const __global numtyp4* coeff2, 
+                                  const __global numtyp4 *coeff3,
+                                  const __global numtyp4 *coeff4,
+                                  const __global numtyp *cutsq_in,
+                                  const __global numtyp* sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *ans, 
+                                  __global acctyp *engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -559,7 +577,7 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -633,19 +651,23 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,

 /// ---------------- BITMAP -------------------------------------------------

-__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
-                       __global int *nshiftbits, __global int *nmask,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_bitmap(const __global numtyp4 *x_, 
+                             const __global int *tabindex,
+                             const __global int *nshiftbits, 
+                             const __global int *nmask,
+                             const __global numtyp4* coeff2, 
+                             const __global numtyp4 *coeff3,
+                             const __global numtyp4 *coeff4,
+                             const int lj_types,
+                             const __global numtyp *cutsq,
+                             const __global numtyp* sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *ans, 
+                             __global acctyp *engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -665,7 +687,7 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -732,18 +754,22 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
  } // if ii
 }

-__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global int *nshiftbits, __global int *nmask,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, 
+                                  const __global int *tabindex,
+                                  const __global int *nshiftbits, 
+                                  const __global int *nmask,
+                                  const __global numtyp4* coeff2, 
+                                  const __global numtyp4 *coeff3,
+                                  const __global numtyp4 *coeff4,
+                                  const __global numtyp *cutsq_in,
+                                  const __global numtyp* sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *ans, 
+                                  __global acctyp *engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -767,7 +793,7 @@ __kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
  int tlm1 = tablength - 1;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@ -24,13 +24,16 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif

-__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
-                          const numtyp kappa, const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_yukawa(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict coeff,
+                       const numtyp kappa, const int lj_types,
+                       const __global numtyp *restrict sp_lj_in, 
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed, 
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

@ -48,7 +51,7 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -105,12 +108,16 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
  } // if ii
 }

-__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
-                               const numtyp kappa, __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict coeff_in,
+                            const numtyp kappa, 
+                            const __global numtyp *restrict sp_lj_in, 
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed, 
+                            __global acctyp4 *restrict ans, 
+                            __global acctyp *restrict engv, 
+                            const int eflag, const int vflag, const int inum, 
+                            const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -132,7 +139,7 @@ __kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
--- a/lib/gpu/lal_yukawa_colloid.cpp
+++ b/lib/gpu/lal_yukawa_colloid.cpp
@ -30,7 +30,7 @@ extern Device<PRECISION,ACC_PRECISION> device;

 template <class numtyp, class acctyp>
 YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(), 
-_allocated(false), _max_rad_size(0) {
+_max_rad_size(0), _allocated(false) {
 }

 template <class numtyp, class acctyp>
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -29,12 +29,16 @@ texture<int2> rad_tex;
 #define rad_tex rad_
 #endif

-__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
-                               __global numtyp4 *coeff, const int lj_types, 
-                               __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag, 
-                               const int vflag, const int inum,
+__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, 
+                               const __global numtyp *restrict rad_,
+                               const __global numtyp4 *restrict coeff, 
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv, 
+                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom,
                               const numtyp kappa) {
  int tid, ii, offset;
@ -54,7 +58,7 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
    virial[i]=(acctyp)0;
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);
@ -113,13 +117,17 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
  } // if ii
 }

-__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
-                                    __global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
-                                    __global int *dev_nbor, __global int *dev_packed, 
-                                    __global acctyp4 *ans, __global acctyp *engv, 
-                                    const int eflag, const int vflag, const int inum, 
-                                    const int nbor_pitch, const int t_per_atom,
-                                    const numtyp kappa) {
+__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp *restrict rad_,
+                                    const __global numtyp4 *restrict coeff_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed, 
+                                    __global acctyp4 *restrict ans, 
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch, 
+                                    const int t_per_atom, const numtyp kappa) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
  
@ -141,7 +149,7 @@ __kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
  __syncthreads();
  
  if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);