From 9656958169df763b79a91194e8c67c876c1f7d8e Mon Sep 17 00:00:00 2001
From: sjplimp
Date: Fri, 1 Jul 2016 23:27:26 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Makefile.lammps.mingw-cross | 2 +-
 lib/gpu/Makefile.linux | 2 +-
 lib/gpu/Makefile.mingw32-cross | 2 +-
 lib/gpu/Makefile.mingw32-cross-mpi | 2 +-
 lib/gpu/Makefile.mingw64-cross | 2 +-
 lib/gpu/Makefile.mingw64-cross-mpi | 2 +-
 lib/gpu/geryon/nvd_device.h | 92 +++----
 lib/gpu/geryon/nvd_kernel.h | 126 +++++-----
 lib/gpu/geryon/nvd_mat.h | 6 +-
 lib/gpu/geryon/nvd_memory.h | 106 ++++----
 lib/gpu/geryon/nvd_texture.h | 24 +-
 lib/gpu/geryon/nvd_timer.h | 30 +--
 lib/gpu/geryon/ocl_device.h | 120 ++++-----
 lib/gpu/geryon/ocl_kernel.h | 132 +++++-----
 lib/gpu/geryon/ocl_mat.h | 6 +-
 lib/gpu/geryon/ocl_memory.h | 116 ++++-----
 lib/gpu/geryon/ocl_texture.h | 8 +-
 lib/gpu/geryon/ocl_timer.h | 30 +--
 lib/gpu/geryon/ucl_arg_kludge.h | 372 ++++++++++++++--------------
 lib/gpu/geryon/ucl_basemat.h | 16 +-
 lib/gpu/geryon/ucl_copy.h | 162 ++++++------
 lib/gpu/geryon/ucl_d_mat.h | 112 ++++-----
 lib/gpu/geryon/ucl_d_vec.h | 98 ++++----
 lib/gpu/geryon/ucl_h_mat.h | 164 ++++++------
 lib/gpu/geryon/ucl_h_vec.h | 144 +++++------
 lib/gpu/geryon/ucl_matrix.h | 48 ++--
 lib/gpu/geryon/ucl_nv_kernel.h | 6 +-
 lib/gpu/geryon/ucl_print.h | 64 ++---
 lib/gpu/geryon/ucl_s_obj_help.h | 30 +--
 lib/gpu/geryon/ucl_types.h | 78 +++---
 lib/gpu/geryon/ucl_vector.h | 48 ++--
 lib/gpu/lal_answer.cpp | 46 ++--
 lib/gpu/lal_atom.cpp | 46 ++--
 lib/gpu/lal_atom.cu | 6 +-
 lib/gpu/lal_atom.h | 60 ++---
 lib/gpu/lal_balance.h | 18 +-
 lib/gpu/lal_base_atomic.cpp | 20 +-
 lib/gpu/lal_base_atomic.h | 22 +-
 lib/gpu/lal_base_charge.cpp | 16 +-
 lib/gpu/lal_base_charge.h | 12 +-
 lib/gpu/lal_base_dipole.cpp | 18 +-
 lib/gpu/lal_base_dipole.h | 12 +-
 lib/gpu/lal_base_dpd.cpp | 18 +-
 lib/gpu/lal_base_dpd.h | 12 +-
 lib/gpu/lal_base_ellipsoid.cpp | 34 +--
 lib/gpu/lal_base_ellipsoid.h | 28 +--
 lib/gpu/lal_base_three.cpp | 52 ++--
 lib/gpu/lal_base_three.h | 31 +--
 lib/gpu/lal_beck.cpp | 10 +-
 lib/gpu/lal_beck.cu | 26 +-
 lib/gpu/lal_beck.h | 12 +-
 lib/gpu/lal_beck_ext.cpp | 8 +-
 lib/gpu/lal_born.cpp | 26 +-
 lib/gpu/lal_born.cu | 58 ++---
 lib/gpu/lal_born.h | 20 +-
 lib/gpu/lal_born_coul_long.cpp | 28 +--
 lib/gpu/lal_born_coul_long.cu | 268 ++++++++++----------
 lib/gpu/lal_born_coul_long.h | 20 +-
 lib/gpu/lal_born_coul_long_ext.cpp | 32 +--
 lib/gpu/lal_born_coul_wolf.cpp | 30 +--
 lib/gpu/lal_born_coul_wolf.cu | 64 ++---
 lib/gpu/lal_born_coul_wolf.h | 20 +-
 lib/gpu/lal_born_coul_wolf_ext.cpp | 24 +-
 lib/gpu/lal_born_ext.cpp | 30 +--
 lib/gpu/lal_buck.cpp | 30 +--
 lib/gpu/lal_buck.cu | 54 ++--
 lib/gpu/lal_buck.h | 18 +-
 lib/gpu/lal_buck_coul.cpp | 30 +--
 lib/gpu/lal_buck_coul.cu | 80 +++---
 lib/gpu/lal_buck_coul.h | 20 +-
 lib/gpu/lal_buck_coul_ext.cpp | 24 +-
 lib/gpu/lal_buck_coul_long.cpp | 26 +-
 lib/gpu/lal_buck_coul_long.cu | 276 ++++++++++-----------
 lib/gpu/lal_buck_coul_long.h | 14 +-
 lib/gpu/lal_buck_coul_long_ext.cpp | 20 +-
 lib/gpu/lal_buck_ext.cpp | 24 +-
 lib/gpu/lal_cg_cmm.cpp | 24 +-
 lib/gpu/lal_cg_cmm.cu | 44 ++--
 lib/gpu/lal_cg_cmm.h | 10 +-
 lib/gpu/lal_cg_cmm_ext.cpp | 14 +-
 lib/gpu/lal_cg_cmm_long.cpp | 24 +-
 lib/gpu/lal_cg_cmm_long.cu | 38 +--
 lib/gpu/lal_cg_cmm_long.h | 12 +-
 lib/gpu/lal_cg_cmm_long_ext.cpp | 16 +-
 lib/gpu/lal_charmm_long.cpp | 20 +-
 lib/gpu/lal_charmm_long.cu | 42 ++--
 lib/gpu/lal_charmm_long.h | 12 +-
 lib/gpu/lal_charmm_long_ext.cpp | 16 +-
 lib/gpu/lal_colloid.cpp | 30 +--
 lib/gpu/lal_colloid.cu | 116 ++++-----
 lib/gpu/lal_colloid.h | 20 +-
 lib/gpu/lal_colloid_ext.cpp | 22 +-
 lib/gpu/lal_coul.cpp | 18 +-
 lib/gpu/lal_coul.cu | 38 +--
 lib/gpu/lal_coul.h | 12 +-
 lib/gpu/lal_coul_debye.cpp | 16 +-
 lib/gpu/lal_coul_debye.cu | 26 +-
 lib/gpu/lal_coul_debye.h | 12 +-
 lib/gpu/lal_coul_debye_ext.cpp | 16 +-
 lib/gpu/lal_coul_dsf.cpp | 18 +-
 lib/gpu/lal_coul_dsf.cu | 56 ++---
 lib/gpu/lal_coul_dsf.h | 10 +-
 lib/gpu/lal_coul_dsf_ext.cpp | 22 +-
 lib/gpu/lal_coul_ext.cpp | 18 +-
 lib/gpu/lal_coul_long.cpp | 14 +-
 lib/gpu/lal_coul_long.cu | 18 +-
 lib/gpu/lal_coul_long.h | 12 +-
 lib/gpu/lal_coul_long_ext.cpp | 42 ++--
 lib/gpu/lal_device.cpp | 112 ++++-----
 lib/gpu/lal_device.cu | 6 +-
 lib/gpu/lal_device.h | 68 ++---
 lib/gpu/lal_dipole_lj.cpp | 16 +-
 lib/gpu/lal_dipole_lj.cu | 90 +++----
 lib/gpu/lal_dipole_lj.h | 8 +-
 lib/gpu/lal_dipole_lj_ext.cpp | 12 +-
 lib/gpu/lal_dipole_lj_sf.cpp | 20 +-
 lib/gpu/lal_dipole_lj_sf.cu | 122 ++++-----
 lib/gpu/lal_dipole_lj_sf.h | 8 +-
 lib/gpu/lal_dipole_lj_sf_ext.cpp | 12 +-
 lib/gpu/lal_dpd.cpp | 30 +--
 lib/gpu/lal_dpd.cu | 84 +++----
 lib/gpu/lal_dpd.h | 18 +-
 lib/gpu/lal_dpd_ext.cpp | 22 +-
 lib/gpu/lal_eam.cpp | 134 +++++-----
 lib/gpu/lal_eam.cu | 136 +++++-----
 lib/gpu/lal_eam.h | 54 ++--
 lib/gpu/lal_eam_alloy_ext.cpp | 32 +--
 lib/gpu/lal_eam_ext.cpp | 32 +--
 lib/gpu/lal_eam_fs_ext.cpp | 32 +--
 lib/gpu/lal_ellipsoid_extra.h | 16 +-
 lib/gpu/lal_ellipsoid_nbor.cu | 34 +--
 lib/gpu/lal_gauss.cpp | 22 +-
 lib/gpu/lal_gauss.cu | 60 ++---
 lib/gpu/lal_gauss.h | 18 +-
 lib/gpu/lal_gauss_ext.cpp | 24 +-
 lib/gpu/lal_gayberne.cpp | 80 +++---
 lib/gpu/lal_gayberne.cu | 144 +++++------
 lib/gpu/lal_gayberne.h | 26 +-
 lib/gpu/lal_gayberne_ext.cpp | 20 +-
 lib/gpu/lal_gayberne_lj.cu | 130 +++++-----
 lib/gpu/lal_lj.cpp | 30 +--
 lib/gpu/lal_lj.cu | 60 ++---
 lib/gpu/lal_lj.h | 16 +-
 lib/gpu/lal_lj96.cpp | 18 +-
 lib/gpu/lal_lj96.cu | 50 ++--
 lib/gpu/lal_lj96.h | 10 +-
 lib/gpu/lal_lj96_ext.cpp | 8 +-
 lib/gpu/lal_lj_class2_long.cpp | 16 +-
 lib/gpu/lal_lj_class2_long.cu | 42 ++--
 lib/gpu/lal_lj_class2_long.h | 8 +-
 lib/gpu/lal_lj_class2_long_ext.cpp | 8 +-
 lib/gpu/lal_lj_coul.cpp | 24 +-
 lib/gpu/lal_lj_coul.cu | 46 ++--
 lib/gpu/lal_lj_coul.h | 8 +-
 lib/gpu/lal_lj_coul_debye.cpp | 22 +-
 lib/gpu/lal_lj_coul_debye.cu | 42 ++--
 lib/gpu/lal_lj_coul_debye.h | 8 +-
 lib/gpu/lal_lj_coul_debye_ext.cpp | 12 +-
 lib/gpu/lal_lj_coul_ext.cpp | 10 +-
 lib/gpu/lal_lj_coul_long.cpp | 22 +-
 lib/gpu/lal_lj_coul_long.cu | 38 +--
 lib/gpu/lal_lj_coul_long.h | 10 +-
 lib/gpu/lal_lj_coul_long_ext.cpp | 18 +-
 lib/gpu/lal_lj_coul_msm.cpp | 24 +-
 lib/gpu/lal_lj_coul_msm.cu | 30 +--
 lib/gpu/lal_lj_coul_msm.h | 14 +-
 lib/gpu/lal_lj_coul_msm_ext.cpp | 10 +-
 lib/gpu/lal_lj_cubic.cpp | 26 +-
 lib/gpu/lal_lj_cubic.cu | 64 ++---
 lib/gpu/lal_lj_cubic.h | 16 +-
 lib/gpu/lal_lj_cubic_ext.cpp | 16 +-
 lib/gpu/lal_lj_dsf.cpp | 24 +-
 lib/gpu/lal_lj_dsf.cu | 46 ++--
 lib/gpu/lal_lj_dsf.h | 8 +-
 lib/gpu/lal_lj_dsf_ext.cpp | 10 +-
 lib/gpu/lal_lj_expand.cpp | 28 +--
 lib/gpu/lal_lj_expand.cu | 70 +++---
 lib/gpu/lal_lj_expand.h | 14 +-
 lib/gpu/lal_lj_expand_ext.cpp | 14 +-
 lib/gpu/lal_lj_ext.cpp | 14 +-
 lib/gpu/lal_lj_gromacs.cpp | 22 +-
 lib/gpu/lal_lj_gromacs.cu | 26 +-
 lib/gpu/lal_lj_gromacs.h | 12 +-
 lib/gpu/lal_lj_gromacs_ext.cpp | 16 +-
 lib/gpu/lal_mie.cpp | 16 +-
 lib/gpu/lal_mie.cu | 42 ++--
 lib/gpu/lal_mie.h | 12 +-
 lib/gpu/lal_mie_ext.cpp | 8 +-
 lib/gpu/lal_morse.cpp | 24 +-
 lib/gpu/lal_morse.cu | 48 ++--
 lib/gpu/lal_morse.h | 12 +-
 lib/gpu/lal_morse_ext.cpp | 14 +-
 lib/gpu/lal_neighbor.cpp | 119 +++++----
 lib/gpu/lal_neighbor.h | 62 ++---
 lib/gpu/lal_neighbor_cpu.cu | 6 +-
 lib/gpu/lal_neighbor_gpu.cu | 118 ++++-----
 lib/gpu/lal_neighbor_shared.cpp | 6 +-
 lib/gpu/lal_neighbor_shared.h | 6 +-
 lib/gpu/lal_pppm.cpp | 44 ++--
 lib/gpu/lal_pppm.cu | 68 ++---
 lib/gpu/lal_pppm.h | 28 +--
 lib/gpu/lal_pppm_ext.cpp | 18 +-
 lib/gpu/lal_precision.h | 16 +-
 lib/gpu/lal_preprocessor.h | 20 +-
 lib/gpu/lal_re_squared.cpp | 68 ++---
 lib/gpu/lal_re_squared.cu | 40 +--
 lib/gpu/lal_re_squared.h | 20 +-
 lib/gpu/lal_re_squared_ext.cpp | 20 +-
 lib/gpu/lal_re_squared_lj.cu | 140 +++++------
 lib/gpu/lal_soft.cpp | 18 +-
 lib/gpu/lal_soft.cu | 32 +--
 lib/gpu/lal_soft.h | 14 +-
 lib/gpu/lal_soft_ext.cpp | 16 +-
 lib/gpu/lal_sw.cpp | 64 ++---
 lib/gpu/lal_sw.cu | 128 +++++-----
 lib/gpu/lal_sw.h | 12 +-
 lib/gpu/lal_sw_ext.cpp | 26 +-
 lib/gpu/lal_table.cpp | 88 +++----
 lib/gpu/lal_table.cu | 328 ++++++++++------------
 lib/gpu/lal_table.h | 36 +--
 lib/gpu/lal_table_ext.cpp | 14 +-
 lib/gpu/lal_tersoff.cpp | 12 +-
 lib/gpu/lal_tersoff.cu | 30 ++-
 lib/gpu/lal_tersoff_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_extra.h | 2 +-
 lib/gpu/lal_tersoff_mod.cpp | 12 +-
 lib/gpu/lal_tersoff_mod.cu | 32 ++-
 lib/gpu/lal_tersoff_mod_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_mod_extra.h | 4 +-
 lib/gpu/lal_tersoff_zbl.cpp | 12 +-
 lib/gpu/lal_tersoff_zbl.cu | 30 ++-
 lib/gpu/lal_tersoff_zbl_ext.cpp | 2 +-
 lib/gpu/lal_tersoff_zbl_extra.h | 2 +-
 lib/gpu/lal_yukawa.cpp | 18 +-
 lib/gpu/lal_yukawa.cu | 52 ++--
 lib/gpu/lal_yukawa.h | 16 +-
 lib/gpu/lal_yukawa_colloid.cpp | 64 ++---
 lib/gpu/lal_yukawa_colloid.cu | 74 +++---
 lib/gpu/lal_yukawa_colloid.h | 28 +--
 lib/gpu/lal_yukawa_colloid_ext.cpp | 26 +-
 lib/gpu/lal_yukawa_ext.cpp | 22 +-
 lib/gpu/lal_zbl.cpp | 28 +--
 lib/gpu/lal_zbl.cu | 108 ++++----
 lib/gpu/lal_zbl.h | 20 +-
 lib/gpu/lal_zbl_ext.cpp | 20 +-
 245 files changed, 4890 insertions(+), 4832 deletions(-)

diff --git a/lib/gpu/Makefile.lammps.mingw-cross b/lib/gpu/Makefile.lammps.mingw-cross
index e92c3e9d0d..12d833c744 100644
--- a/lib/gpu/Makefile.lammps.mingw-cross
+++ b/lib/gpu/Makefile.lammps.mingw-cross
@@ -1,6 +1,6 @@
 # Settings that the LAMMPS build will import when this package library is used
 # settings for OpenCL builds
 gpu_SYSINC =
-gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
+gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
 gpu_SYSPATH =
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index 1e689a355c..d72c0ba437 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -7,7 +7,7 @@
 EXTRAMAKE = Makefile.lammps.standard
 
-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
 CUDA_HOME = /usr/local/cuda
 endif
 
diff --git a/lib/gpu/Makefile.mingw32-cross b/lib/gpu/Makefile.mingw32-cross
index 3f1240af1a..6f77634755 100644
--- a/lib/gpu/Makefile.mingw32-cross
+++ b/lib/gpu/Makefile.mingw32-cross
@@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
           -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
           -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
 EXTRAMAKE = Makefile.lammps.mingw-cross
diff --git a/lib/gpu/Makefile.mingw32-cross-mpi b/lib/gpu/Makefile.mingw32-cross-mpi
index 6dae2d0604..94099cd90b 100644
--- a/lib/gpu/Makefile.mingw32-cross-mpi
+++ b/lib/gpu/Makefile.mingw32-cross-mpi
@@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
           -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
           -I../../tools/mingw-cross/mpich2-win32/include/ \
           -DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../tools/mingw-cross/mpich2-win32/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/Makefile.mingw64-cross b/lib/gpu/Makefile.mingw64-cross
index 606b0309cb..54f6af8c65 100644
--- a/lib/gpu/Makefile.mingw64-cross
+++ b/lib/gpu/Makefile.mingw64-cross
@@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
           -msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
           -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../src/STUBS -lmpi_mingw64
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/Makefile.mingw64-cross-mpi b/lib/gpu/Makefile.mingw64-cross-mpi
index cea8155efd..2ff72d98b1 100644
--- a/lib/gpu/Makefile.mingw64-cross-mpi
+++ b/lib/gpu/Makefile.mingw64-cross-mpi
@@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
           -I../../tools/mingw-cross/mpich2-win64/include/ \
           -DMPICH_IGNORE_CXX_SEEK
 
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
           -L../../tools/mingw-cross/mpich2-win64/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 3b7781753c..2d2a751f85 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -35,7 +35,7 @@ namespace ucl_cudadr {
 // --------------------------------------------------------------------------
 // - COMMAND QUEUE STUFF
 // --------------------------------------------------------------------------
-typedef CUstream command_queue; 
+typedef CUstream command_queue;
 
 inline void ucl_sync(CUstream &stream) {
   CU_SAFE_CALL(cuStreamSynchronize(stream));
@@ -59,21 +59,21 @@ struct NVDProperties {
 
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
- *        behavior 
+ *        behavior
  *  \note There is no error checking for indexing past the number of devices **/
 class UCL_Device {
  public:
   /// Collect properties for every GPU on the node
   /** \note You must set the active GPU with set() before using the device **/
   inline UCL_Device();
-  
+
   inline ~UCL_Device();
 
   /// Returns 1 (For compatibility with OpenCL)
   inline int num_platforms() { return 1; }
 
   /// Return a string with name and info of the current platform
-  inline std::string platform_name() 
+  inline std::string platform_name()
     { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
 
   /// Delete any contexts/data and set the platform number to be used
@@ -97,24 +97,24 @@ class UCL_Device {
 
   /// Returns the default stream for the current device
   inline command_queue & cq() { return cq(0); }
-  
+
   /// Returns the stream indexed by i
   inline command_queue & cq(const int i) { return _cq[i]; }
-  
+
   /// Block until all commands in the default stream have completed
   inline void sync() { sync(0); }
-  
+
   /// Block until all commands in the specified stream have completed
   inline void sync(const int i) { ucl_sync(cq(i)); }
-  
+
   /// Get the number of command queues currently available on device
-  inline int num_queues() 
+  inline int num_queues()
     { return _cq.size(); }
-  
+
   /// Add a stream for device computations
   inline void push_command_queue() {
-    _cq.push_back(CUstream()); 
-    CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); 
+    _cq.push_back(CUstream());
+    CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
   }
 
   /// Remove a stream for device computations
@@ -124,19 +124,19 @@ class UCL_Device {
     CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
     _cq.pop_back();
   }
-  
+
   /// Set the default command queue (by default this is the null stream)
-  /** \param i index of the command queue (as added by push_command_queue()) 
+  /** \param i index of the command queue (as added by push_command_queue())
       If i is 0, the default command queue is set to the null stream **/
   inline void set_command_queue(const int i) {
     if (i==0) _cq[0]=0;
     else _cq[0]=_cq[i];
   }
-  
+
   /// Get the current CUDA device name
   inline std::string name() { return name(_device); }
   /// Get the CUDA device name
-  inline std::string name(const int i) 
+  inline std::string name(const int i)
     { return std::string(_properties[i].name); }
 
   /// Get a string telling the type of the current device
@@ -148,38 +148,38 @@ class UCL_Device {
   inline int device_type() { return device_type(_device); }
   /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
   inline int device_type(const int i) { return UCL_GPU; }
-  
+
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
-  
+
   /// Returns true if double precision is support for the current device
   inline bool double_precision() { return double_precision(_device); }
   /// Returns true if double precision is support for the device
   inline bool double_precision(const int i) {return arch(i)>=1.3;}
-  
+
   /// Get the number of compute units on the current device
   inline unsigned cus() { return cus(_device); }
   /// Get the number of compute units
-  inline unsigned cus(const int i) 
+  inline unsigned cus(const int i)
     { return _properties[i].multiProcessorCount; }
 
   /// Get the number of cores in the current device
   inline unsigned cores() { return cores(_device); }
   /// Get the number of cores
-  inline unsigned cores(const int i) 
-    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; 
+  inline unsigned cores(const int i)
+    { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
       else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
       else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
       else return _properties[i].multiProcessorCount*192; }
-  
+
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
   /// Get the gigabytes of global memory
-  inline double gigabytes(const int i) 
+  inline double gigabytes(const int i)
     { return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
-  
+
   /// Get the bytes of global memory in the current device
   inline size_t bytes() { return bytes(_device); }
   /// Get the bytes of global memory
@@ -188,13 +188,13 @@ class UCL_Device {
   // Get the gigabytes of free memory in the current device
   inline double free_gigabytes() { return free_gigabytes(_device); }
   // Get the gigabytes of free memory
-  inline double free_gigabytes(const int i) 
+  inline double free_gigabytes(const int i)
     { return static_cast<double>(free_bytes(i))/1073741824; }
-  
+
   // Get the bytes of free memory in the current device
   inline size_t free_bytes() { return free_bytes(_device); }
   // Get the bytes of free memory
-  inline size_t free_bytes(const int i) { 
+  inline size_t free_bytes(const int i) {
     CUDA_INT_TYPE dfree, dtotal;
     CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
     return static_cast<size_t>(dfree);
@@ -203,21 +203,21 @@ class UCL_Device {
   /// Return the GPGPU compute capability for current device
   inline double arch() { return arch(_device); }
   /// Return the GPGPU compute capability
-  inline double arch(const int i) 
+  inline double arch(const int i)
     { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
-  
+
   /// Clock rate in GHz for current device
   inline double clock_rate() { return clock_rate(_device); }
   /// Clock rate in GHz
-  inline double clock_rate(const int i) 
+  inline double clock_rate(const int i)
     { return _properties[i].p.clockRate*1e-6;}
-  
+
   /// Get the maximum number of threads per block
   inline size_t group_size() { return group_size(_device); }
   /// Get the maximum number of threads per block
-  inline size_t group_size(const int i) 
+  inline size_t group_size(const int i)
     { return _properties[i].p.maxThreadsPerBlock; }
-  
+
   /// Return the maximum memory pitch in bytes for current device
   inline size_t max_pitch() { return max_pitch(_device); }
   /// Return the maximum memory pitch in bytes
@@ -242,7 +242,7 @@ class UCL_Device {
     { return fission_by_counts(_device); }
   /// True if splitting device into subdevices by specified counts supported
   inline bool fission_by_counts(const int i)
-    { return false; } 
+    { return false; }
   /// True if splitting device into subdevices by affinity domains supported
   inline bool fission_by_affinity()
     { return fission_by_affinity(_device); }
@@ -259,7 +259,7 @@ class UCL_Device {
   /// List all devices along with all properties
   inline void print_all(std::ostream &out);
-  
+
  private:
   int _device, _num_devices;
   std::vector<NVDProperties> _properties;
@@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
     if (major==9999)
       continue;
-    
+
     _properties.push_back(NVDProperties());
     _properties.back().device_id=dev;
    _properties.back().major=major;
     _properties.back().minor=minor;
-    
+
     char namecstr[1024];
     CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
     _properties.back().name=namecstr;
-    
+
     CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
                                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
 #if CUDA_VERSION >= 2020
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().kernelExecTimeoutEnabled, 
+                          &_properties.back().kernelExecTimeoutEnabled,
                           CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
                           &_properties.back().integrated,
                           CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().canMapHostMemory, 
+                          &_properties.back().canMapHostMemory,
                           CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
-    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode, 
+    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
                                          CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
 #endif
 #if CUDA_VERSION >= 3010
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().concurrentKernels, 
+                          &_properties.back().concurrentKernels,
                           CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(
-                          &_properties.back().ECCEnabled, 
+                          &_properties.back().ECCEnabled,
                           CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
 #endif
   }
@@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
   cuDriverGetVersion(&driver_version);
   out << "CUDA Driver Version: "
       << driver_version/1000 << "." << driver_version%100
-      << std::endl; 
+      << std::endl;
 #endif
 
   if (num_devices() == 0)
diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index e0bfb1bb5e..d03a715e1b 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -35,15 +35,15 @@ template <class numtyp> class UCL_D_Mat;
 template <class hosttype, class devtype> class UCL_Vector;
 template <class hosttype, class devtype> class UCL_Matrix;
 #define UCL_MAX_KERNEL_ARGS 256
-  
+
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
  public:
   inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
-  inline UCL_Program(UCL_Device &device, const void *program, 
-                     const char *flags="", std::string *log=NULL) { 
+  inline UCL_Program(UCL_Device &device, const void *program,
+                     const char *flags="", std::string *log=NULL) {
     _cq=device.cq();
-    init(device); 
+    init(device);
     load_string(program,flags,log);
   }
 
@@ -61,20 +61,20 @@ class UCL_Program {
                        std::string *log=NULL) {
     std::ifstream in(filename);
     if (!in || in.is_open()==false) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Could not open kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not open kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
       return UCL_FILE_NOT_FOUND;
     }
-    
+
     std::string program((std::istreambuf_iterator<char>(in)),
                         std::istreambuf_iterator<char>());
     in.close();
     return load_string(program.c_str(),flags,log);
   }
-  
+
   /// Load a program from a string and compile with flags
   inline int load_string(const void *program, const char *flags="",
                          std::string *log=NULL) {
@@ -94,12 +94,12 @@ class UCL_Program {
     CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
                                     options,(void **)values);
-    
+
     if (log!=NULL)
       *log=std::string(clog);
-    
+
     if (err != CUDA_SUCCESS) {
-      #ifndef UCL_NO_EXIT 
+      #ifndef UCL_NO_EXIT
       std::cerr << std::endl
                 << "----------------------------------------------------------\n"
                 << " UCL Error: Error compiling PTX Program...\n"
@@ -108,24 +108,24 @@ class UCL_Program {
       #endif
       return UCL_COMPILE_ERROR;
     }
-    
+
     return UCL_SUCCESS;
-  } 
-  
+  }
+
   /// Load a precompiled program from a file
   inline int load_binary(const char *filename) {
     CUmodule _module;
     CUresult err = cuModuleLoad(&_module,filename);
     if (err==301) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Could not open binary kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Could not open binary kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
       return UCL_FILE_NOT_FOUND;
     } else if (err!=CUDA_SUCCESS) {
-      #ifndef UCL_NO_EXIT 
-      std::cerr << "UCL Error: Error loading binary kernel file: " 
+      #ifndef UCL_NO_EXIT
+      std::cerr << "UCL Error: Error loading binary kernel file: "
                 << filename << std::endl;
       UCL_GERYON_EXIT;
       #endif
@@ -138,7 +138,7 @@ class UCL_Program {
     //  return UCL_ERROR;
     return UCL_SUCCESS;
   }
-  
+
   friend class UCL_Kernel;
  private:
   CUmodule _module;
@@ -149,23 +149,23 @@ class UCL_Program {
 /// Class for dealing with CUDA Driver kernels
 class UCL_Kernel {
  public:
-  UCL_Kernel() : _dimensions(1), _num_args(0) { 
+  UCL_Kernel() : _dimensions(1), _num_args(0) {
     #if CUDA_VERSION < 4000
     _param_size=0;
     #endif
-    _num_blocks[0]=0; 
+    _num_blocks[0]=0;
   }
-  
-  UCL_Kernel(UCL_Program &program, const char *function) : 
+
+  UCL_Kernel(UCL_Program &program, const char *function) :
     _dimensions(1), _num_args(0) {
     #if CUDA_VERSION < 4000
     _param_size=0;
     #endif
-    _num_blocks[0]=0; 
-    set_function(program,function); 
-    _cq=program._cq; 
+    _num_blocks[0]=0;
+    set_function(program,function);
+    _cq=program._cq;
   }
-  
+
   ~UCL_Kernel() {}
 
   /// Clear any function associated with the kernel
@@ -189,7 +189,7 @@ class UCL_Kernel {
   /// Set the kernel argument.
   /** If not a device pointer, this must be repeated each time the argument
-    * changes 
+    * changes
     * \note To set kernel parameter i (i>0), parameter i-1 must be set **/
   template <class dtype>
   inline void set_arg(const unsigned index, const dtype * const arg) {
@@ -202,27 +202,27 @@ class UCL_Kernel {
       CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
     #endif
     else
-      assert(0==1); // Must add kernel parameters in sequential order 
+      assert(0==1); // Must add kernel parameters in sequential order
   }
-  
+
   /// Set a geryon container as a kernel argument.
   template <class numtyp>
-  inline void set_arg(const UCL_D_Vec<numtyp> * const arg) 
+  inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
     { set_arg(&arg->begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class numtyp>
-  inline void set_arg(const UCL_D_Mat<numtyp> * const arg) 
+  inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
     { set_arg(&arg->begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+  inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
     { set_arg(&arg->device.begin()); }
 
   /// Set a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+  inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
     { set_arg(&arg->device.begin()); }
 
   /// Add a kernel argument.
@@ -257,37 +257,37 @@ class UCL_Kernel {
 
   /// Add a geryon container as a kernel argument.
   template <class numtyp>
-  inline void add_arg(const UCL_D_Vec<numtyp> * const arg) 
+  inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
     { add_arg(&arg->begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class numtyp>
-  inline void add_arg(const UCL_D_Mat<numtyp> * const arg) 
+  inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
     { add_arg(&arg->begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg) 
+  inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
    { add_arg(&arg->device.begin()); }
 
   /// Add a geryon container as a kernel argument.
   template <class hosttype, class devtype>
-  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg) 
+  inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
     { add_arg(&arg->device.begin()); }
 
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
-  inline void set_size(const size_t num_blocks, const size_t block_size) { 
-    _dimensions=1; 
-    _num_blocks[0]=num_blocks; 
+  inline void set_size(const size_t num_blocks, const size_t block_size) {
+    _dimensions=1;
+    _num_blocks[0]=num_blocks;
     _num_blocks[1]=1;
     _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size;
     _block_size[1]=1;
     _block_size[2]=1;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
     #endif
   }
@@ -303,43 +303,43 @@ class UCL_Kernel {
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
-                       const size_t block_size_x, const size_t block_size_y) { 
-    _dimensions=2; 
-    _num_blocks[0]=num_blocks_x; 
-    _num_blocks[1]=num_blocks_y; 
+                       const size_t block_size_x, const size_t block_size_y) {
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=1;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
     #endif
   }
-  
+
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue for the kernel is changed to cq **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
                        const size_t block_size_x, const size_t block_size_y,
-                       command_queue &cq) 
+                       command_queue &cq)
     {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
 
   /// Set the number of thread blocks and the number of threads in each block
   /** \note This should be called before any arguments have been added
       \note The default command queue is used for the kernel execution **/
   inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
-                       const size_t block_size_x, 
+                       const size_t block_size_x,
                        const size_t block_size_y, const size_t block_size_z) {
-    _dimensions=2; 
-    _num_blocks[0]=num_blocks_x; 
-    _num_blocks[1]=num_blocks_y; 
-    _num_blocks[2]=1; 
+    _dimensions=2;
+    _num_blocks[0]=num_blocks_x;
+    _num_blocks[1]=num_blocks_y;
+    _num_blocks[2]=1;
     #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=block_size_z;
-    #else 
+    #else
     CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
                                      block_size_z));
     #endif
@@ -352,10 +352,10 @@ class UCL_Kernel {
                        const size_t block_size_x, const size_t block_size_y,
                        const size_t block_size_z, command_queue &cq) {
     _cq=cq;
-    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, 
+    set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
              block_size_z);
   }
-  
+
   /// Run the kernel in the default command queue
   inline void run() {
     #if CUDA_VERSION >= 4000
@@ -367,12 +367,12 @@ class UCL_Kernel {
     CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
     #endif
   }
-  
+
   /// Clear any arguments associated with the kernel
-  inline void clear_args() { 
-    _num_args=0; 
+  inline void clear_args() {
+    _num_args=0;
     #if CUDA_VERSION < 4000
-    _offsets.clear(); 
+    _offsets.clear();
     _param_size=0;
     #endif
   }
@@ -390,7 +390,7 @@ class UCL_Kernel {
   unsigned _num_blocks[3];
   unsigned _num_args;
   friend class UCL_Texture;
-  
+
   #if CUDA_VERSION >= 4000
   unsigned _block_size[3];
   void * _kernel_args[UCL_MAX_KERNEL_ARGS];
diff --git a/lib/gpu/geryon/nvd_mat.h b/lib/gpu/geryon/nvd_mat.h
index 51cfe1d56f..042e2978c3 100644
--- a/lib/gpu/geryon/nvd_mat.h
+++ b/lib/gpu/geryon/nvd_mat.h
@@ -17,12 +17,12 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
 
 /*! \file */
-  
+
 #ifndef NVD_MAT_H
 #define NVD_MAT_H
 
@@ -52,6 +52,6 @@ namespace ucl_cudadr {
 #include "ucl_print.h"
 #undef UCL_PRINT_ALLOW
 
-} // namespace ucl_cudadr 
+} // namespace ucl_cudadr
 
 #endif
diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h
index 5f7b98ba5c..0484e33de6 100644
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
 // --------------------------------------------------------------------------
 // - HOST MEMORY ALLOCATION ROUTINES
 // --------------------------------------------------------------------------
 template <class mat_type, class copy_type>
-inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, 
+inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                        const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
   if (kind==UCL_NOT_PINNED)
@@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 template <class mat_type>
-inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, 
+inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                        const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
   CUresult err=CUDA_SUCCESS;
   if (kind==UCL_NOT_PINNED)
@@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
     *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
   else if (mat.kind()==UCL_WRITE_ONLY)
     err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
-  else 
+  else
     err=cuMemAllocHost((void **)mat.host_ptr(),n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
@@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                          const size_t cols, size_t &pitch,
                          const enum UCL_MEMOPT kind) {
   CUresult err;
-  CUDA_INT_TYPE upitch; 
+  CUDA_INT_TYPE upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   mat.cq()=cm.cq();
   return UCL_SUCCESS;
-} 
+}
 
 template <class mat_type>
 inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                          const size_t cols, size_t &pitch,
                          const enum UCL_MEMOPT kind) {
   CUresult err;
-  unsigned upitch; 
+  unsigned upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   mat.cq()=d.cq();
   return UCL_SUCCESS;
-} 
+}
 
 template <class mat_type>
 inline void _device_free(mat_type &mat) {
@@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
                           const size_t cols, size_t &pitch) {
   _device_free(mat);
   CUresult err;
-  CUDA_INT_TYPE upitch; 
+  CUDA_INT_TYPE upitch;
   err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                       cols*sizeof(typename mat_type::data_type),rows,16);
-  pitch=static_cast<size_t>(upitch); 
+  pitch=static_cast<size_t>(upitch);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
   return UCL_SUCCESS;
-} 
+}
 
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) { 
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
   *ptr=in;
 }
 
 template <class numtyp>
-inline void _device_view(CUdeviceptr *ptr, numtyp *in) { 
-  *ptr=0; 
+inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
+  *ptr=0;
 }
 
-inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in, 
-                         const size_t offset, const size_t numsize) { 
+inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
+                         const size_t offset, const size_t numsize) {
   *ptr=in+offset*numsize;
 }
 
 template <class numtyp>
 inline void _device_view(CUdeviceptr *ptr, numtyp *in,
-                         const size_t offset, const size_t numsize) { 
-  *ptr=0; 
+                         const size_t offset, const size_t numsize) {
+  *ptr=0;
 }
 
 // --------------------------------------------------------------------------
@@ -211,13 +211,13 @@ template <class mat_type, class copy_type>
 inline void _device_image_alloc(mat_type &mat, copy_type &cm,
                                 const size_t rows, const size_t cols) {
   assert(0==1);
-} 
+}
 
 template <class mat_type>
 inline void _device_image_alloc(mat_type &mat, UCL_Device &d,
                                 const size_t rows, const size_t cols) {
   assert(0==1);
-} 
+}
 
 template <class mat_type>
 inline void _device_image_free(mat_type &mat) {
@@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
 // --------------------------------------------------------------------------
 // - HELPER FUNCTIONS FOR MEMCPY ROUTINES
 // --------------------------------------------------------------------------
-inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch, 
+inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
                             const size_t spitch, const size_t cols,
                             const size_t rows) {
   ins.srcXInBytes=0;
@@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
   ins.WidthInBytes=cols;
   ins.Height=rows;
 }
-  
+
 template <int mem> struct _nvd_set_2D_mem;
-template <> struct _nvd_set_2D_mem<1> 
+template <> struct _nvd_set_2D_mem<1>
   { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
-template <> struct _nvd_set_2D_mem<2> 
+template <> struct _nvd_set_2D_mem<2>
   { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
-template <int mem> struct _nvd_set_2D_mem 
+template <int mem> struct _nvd_set_2D_mem
   { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
 
@@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
     assert(0==1);
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
     assert(0==1); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
     CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
     CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
                         CUstream &cq)
     { memcpy(dst.begin(),src.begin(),n); }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     CUDA_MEMCPY2D ins;
@@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
     CU_SAFE_CALL(cuMemcpy2D(&ins));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
@@ -579,18 +579,18 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows) {
     if (p1::PADDED==0 || p2::PADDED==0) {
       size_t src_offset=0, dst_offset=0;
-      for (size_t i=0; i<rows; i++) { 
+      for (size_t i=0; i<rows; i++) {
       ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
@@ -601,12 +601,12 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     }
   }
   template <class p1, class p2>
-  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
+  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                         const size_t spitch, const size_t cols,
                         const size_t rows, CUstream &cq) {
     if (p1::PADDED==0 || p2::PADDED==0) {
       size_t src_offset=0, dst_offset=0;
-      for (size_t i=0; i<rows; i++) { 
+      for (size_t i=0; i<rows; i++) {
 template <class mat1, class mat2>
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, 
-                       const size_t spitch, const size_t cols, 
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
                        const size_t rows) {
   _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                  rows);
 }
 template <class mat1, class mat2>
-inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, 
-                       const size_t spitch, const size_t cols, 
+inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
+                       const size_t spitch, const size_t cols,
                        const size_t rows,CUstream &cq) {
   _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
                                                  rows,cq);
 }
 
-} // namespace ucl_cudart 
+} // namespace ucl_cudart
 
 #endif
diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h
index 07650263a5..965595a448 100644
--- a/lib/gpu/geryon/nvd_texture.h
+++ b/lib/gpu/geryon/nvd_texture.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -28,7 +28,7 @@
 #include "nvd_mat.h"
 
 namespace ucl_cudadr {
-  
+
 /// Class storing a texture reference
 class UCL_Texture {
  public:
@@ -38,39 +38,39 @@ class UCL_Texture {
   inline UCL_Texture(UCL_Program &prog, const char *texture_name)
     { get_texture(prog,texture_name); }
   /// Set the texture reference for this object
-  inline void get_texture(UCL_Program &prog, const char *texture_name) 
+  inline void get_texture(UCL_Program &prog, const char *texture_name)
     { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class numtyp>
-  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel) 
+  inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
     { _bind_float(vec,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class numtyp>
-  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel) 
+  inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
     { _bind_float(vec,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class hosttype, class devtype>
-  inline void bind_float(UCL_Vector<hosttype, devtype> &vec, const unsigned numel) 
+  inline void bind_float(UCL_Vector<hosttype, devtype> &vec, const unsigned numel)
     { _bind_float(vec.device,numel); }
 
   /// Bind a float array where each fetch grabs a vector of length numel
   template <class hosttype, class devtype>
-  inline void bind_float(UCL_Matrix<hosttype, devtype> &vec, const unsigned numel) 
+  inline void bind_float(UCL_Matrix<hosttype, devtype> &vec, const unsigned numel)
     { _bind_float(vec.device,numel); }
 
   /// Unbind the texture reference from the memory allocation
   inline void unbind() { }
 
-  /// Make a texture reference available to kernel 
-  inline void allow(UCL_Kernel &kernel) { 
+  /// Make a texture reference available to kernel
+  inline void allow(UCL_Kernel &kernel) {
     #if CUDA_VERSION < 4000
-    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex)); 
+    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
     #endif
   }
-  
+
  private:
   CUtexref _tex;
   friend class UCL_Kernel;
@@ -80,7 +80,7 @@ class UCL_Texture {
     #ifdef UCL_DEBUG
     assert(numel!=0 && numel<5);
     #endif
-    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(), 
+    CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
                                     vec.numel()*vec.element_size()));
     if (vec.element_size()==sizeof(float))
       CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
diff --git a/lib/gpu/geryon/nvd_timer.h b/lib/gpu/geryon/nvd_timer.h
index 4c3e993e0d..aefbaea0c3 100644
--- a/lib/gpu/geryon/nvd_timer.h
+++ b/lib/gpu/geryon/nvd_timer.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2010) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
 ----------------------------------------------------------------------- */
@@ -41,7 +41,7 @@ class UCL_Timer {
   /// Clear any data associated with timer
   /** \note init() must be called to reuse timer after a clear() **/
   inline void clear() {
-    if (_initialized) { 
+    if (_initialized) {
       CU_DESTRUCT_CALL(cuEventDestroy(start_event));
       CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
       _initialized=false;
@@ -63,16 +63,16 @@ class UCL_Timer {
 
   /// Start timing on command queue
   inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
-  
+
   /// Stop timing on command queue
   inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
-  
+
   /// Block until the start event has been reached on device
-  inline void sync_start() 
+  inline void sync_start()
     { CU_SAFE_CALL(cuEventSynchronize(start_event)); }
 
   /// Block until the stop event has been reached on device
-  inline void sync_stop() 
+  inline void sync_stop()
     { CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
 
   /// Set the time elapsed to zero (not the total_time)
@@ -80,29 +80,29 @@ class UCL_Timer {
     CU_SAFE_CALL(cuEventRecord(start_event,_cq));
     CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
   }
-  
+
   /// Set the total time to zero
   inline void zero_total() { _total_time=0.0; }
-  
+
   /// Add time from previous start and stop to total
   /** Forces synchronization **/
-  inline double add_to_total() 
+  inline double add_to_total()
     { double t=time(); _total_time+=t; return t/1000.0; }
-  
+
   /// Add a user specified time to the total (ms)
   inline void add_time_to_total(const double t) { _total_time+=t; }
-  
+
   /// Return the time (ms) of last start to stop - Forces synchronization
-  inline double time() { 
+  inline double time() {
     float timer;
     CU_SAFE_CALL(cuEventSynchronize(stop_event));
     CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
-    return timer; 
+    return timer;
   }
-  
+
   /// Return the time (s) of last start to stop - Forces synchronization
   inline double seconds() { return time()/1000.0; }
-  
+
   /// Return the total time in ms
   inline double total_time() { return _total_time; }
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 8dadcf2efd..20656c8489 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -17,7 +17,7 @@
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
+   certain rights in this software.  This software is distributed under
    the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -40,13 +40,13 @@ #include "ucl_types.h" namespace ucl_opencl { - + // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- -typedef cl_command_queue command_queue; +typedef cl_command_queue command_queue; typedef cl_context context_type; - + inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } @@ -76,19 +76,19 @@ struct OCLProperties { /// Class for looking at data parallel device properties /** \note Calls to change the device outside of the class results in incorrect - * behavior + * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); - + inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } - + /// Return a string with name and info of the current platform inline std::string platform_name(); @@ -104,38 +104,38 @@ class UCL_Device { * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). **/ inline int set(int num); - + /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } - + /// Returns the context for the current device inline cl_context & context() { return _context; } - + /// Returns the default stream for the current device inline command_queue & cq() { return cq(_default_cq); } - + /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } - + /// Set the default command queue - /** \param i index of the command queue (as added by push_command_queue()) + /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the command queue created with device initialization is used **/ inline void set_command_queue(const int i) { _default_cq=i; } - + /// Block until all commands in the default stream have completed inline void sync() { sync(_default_cq); } - + /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } - + /// Get the number of command queues currently available on device - inline int num_queues() + inline int num_queues() { return _cq.size(); } - + /// Add a command queue for device computations (with profiling enabled) inline void push_command_queue() { cl_int errorv; @@ -143,7 +143,7 @@ class UCL_Device { _cq.back()=clCreateCommandQueue(_context,_cl_device, CL_QUEUE_PROFILING_ENABLE,&errorv); if (errorv!=CL_SUCCESS) { - std::cerr << "Could not create command queue on device: " << name() + std::cerr << "Could not create command queue on device: " << name() << std::endl; UCL_GERYON_EXIT; } @@ -160,76 +160,76 @@ class UCL_Device { /// Get the current OpenCL device name inline std::string name() { return name(_device); } /// Get the OpenCL device name - inline std::string name(const int i) + inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string 
device_type_name(const int i); - + /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i); - + /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device - inline bool shared_memory(const int i) + inline bool shared_memory(const int i) { return _shared_mem_device(_properties[i].device_type); } - + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device - inline bool double_precision(const int i) + inline bool double_precision(const int i) {return _properties[i].double_precision;} - + /// Get the number of compute units on the current device inline unsigned cus() { return cus(_device); } /// Get the number of compute units - inline unsigned cus(const int i) + inline unsigned cus(const int i) { return _properties[i].compute_units; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory - inline double gigabytes(const int i) + inline double gigabytes(const int i) { return static_cast(_properties[i].global_mem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].global_mem; } - + /// Return the GPGPU revision number for current device //inline double revision() { return revision(_device); } /// Return the GPGPU revision number - //inline double revision(const int i) + //inline double revision(const int i) // { return //static_cast(_properties[i].minor)/10+_properties[i].major;} - + /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].clock*1e-3;} - + /// Return the address alignment in bytes inline int alignment() { return alignment(_device); } /// Return the address alignment in bytes inline int alignment(const int i) { return _properties[i].alignment; } - + /// Return the timer resolution inline size_t timer_resolution() { return timer_resolution(_device); } /// Return the timer resolution - inline size_t timer_resolution(const int i) + inline size_t timer_resolution(const int i) { return _properties[i].timer_resolution; } - + /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block - inline size_t group_size(const int i) + inline size_t group_size(const int i) { return _properties[i].work_group_size; } - + /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes @@ -254,7 +254,7 @@ class UCL_Device { { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) - { return _properties[i].partition_counts; } + { return _properties[i].partition_counts; } /// True if splitting device into subdevices by affinity domains supported inline bool 
fission_by_affinity() { return fission_by_affinity(_device); } @@ -271,10 +271,10 @@ class UCL_Device { /// List all devices along with all properties inline void print_all(std::ostream &out); - + /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } - + private: int _num_platforms; // Number of platforms int _platform; // UCL_Device ID for current platform @@ -287,7 +287,7 @@ class UCL_Device { std::vector _cl_devices; // OpenCL IDs for all devices int _num_devices; // Number of devices std::vector _properties; // Properties for each device - + inline void add_properties(cl_device_id); inline int create_context(); int _default_cq; @@ -300,7 +300,7 @@ UCL_Device::UCL_Device() { // --- Get Number of Platforms cl_uint nplatforms; cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); - + if (errorv!=CL_SUCCESS) { _num_platforms=0; return; @@ -328,18 +328,18 @@ void UCL_Device::clear() { int UCL_Device::set_platform(int pid) { clear(); cl_int errorv; - + _cl_device=0; _device=-1; _num_devices=0; _default_cq=0; - + #ifdef UCL_DEBUG assert(pid namespace ucl_opencl { - + class UCL_Texture; template class UCL_D_Vec; template class UCL_D_Mat; @@ -41,10 +41,10 @@ class UCL_Program { public: inline UCL_Program() : _init_done(false) {} inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); } - inline UCL_Program(UCL_Device &device, const void *program, - const char *flags="", std::string *log=NULL) : - _init_done(false) { - init(device); + inline UCL_Program(UCL_Device &device, const void *program, + const char *flags="", std::string *log=NULL) : + _init_done(false) { + init(device); load_string(program,flags,log); } @@ -56,7 +56,7 @@ class UCL_Program { _device=device.cl_device(); _context=device.context(); _cq=device.cq(); - CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainContext(_context)); CL_SAFE_CALL(clRetainCommandQueue(_cq)); _init_done=true; } @@ -65,7 +65,7 @@ class UCL_Program { /** \note Must call init() after each clear **/ inline void clear() { if (_init_done) { - CL_DESTRUCT_CALL(clReleaseProgram(_program)); + CL_DESTRUCT_CALL(clReleaseProgram(_program)); CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); _init_done=false; @@ -77,20 +77,20 @@ class UCL_Program { std::string *log=NULL) { std::ifstream in(filename); if (!in || in.is_open()==false) { - #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not open kernel file: " + #ifndef UCL_NO_EXIT + std::cerr << "UCL Error: Could not open kernel file: " << filename << std::endl; UCL_GERYON_EXIT; #endif return UCL_FILE_NOT_FOUND; } - + std::string program((std::istreambuf_iterator(in)), std::istreambuf_iterator()); in.close(); return load_string(program.c_str(),flags,log); } - + /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", std::string *log=NULL) { @@ -103,23 +103,23 @@ class UCL_Program { CL_CHECK_ERR(error_flag); cl_build_status build_status; CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device, - CL_PROGRAM_BUILD_STATUS, + CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status),&build_status, NULL)); - + if (build_status != CL_SUCCESS || log!=NULL) { size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, NULL, &ms)); - char build_log[ms]; + char build_log[ms]; 
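// Portability note: "char build_log[ms]" above declares a variable-length
// array, a GCC/Clang extension rather than standard C++; a heap buffer of
// size ms (e.g. std::vector<char>) would be the portable equivalent.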
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, build_log, NULL)); - + if (log!=NULL) *log=std::string(build_log); - + if (build_status != CL_SUCCESS) { - #ifndef UCL_NO_EXIT + #ifndef UCL_NO_EXIT std::cerr << std::endl << "----------------------------------------------------------\n" << " UCL Error: Error compiling OpenCL Program (" @@ -130,10 +130,10 @@ class UCL_Program { return UCL_COMPILE_ERROR; } } - + return UCL_SUCCESS; } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return _cq; } /// Change the default command queue associated with matrix @@ -143,7 +143,7 @@ class UCL_Program { private: bool _init_done; cl_program _program; - cl_device_id _device; + cl_device_id _device; cl_context _context; cl_command_queue _cq; }; @@ -153,7 +153,7 @@ class UCL_Kernel { public: UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0) { _block_size[0]=0; _num_blocks[0]=0; } - + inline UCL_Kernel(UCL_Program &program, const char *function) : _dimensions(1), _function_set(false), _num_args(0) { _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); } @@ -178,48 +178,48 @@ class UCL_Kernel { /** If not a device pointer, this must be repeated each time the argument * changes **/ template - inline void set_arg(const cl_uint index, const dtype * const arg) { - CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); + inline void set_arg(const cl_uint index, const dtype * const arg) { + CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg)); if (index>_num_args) { _num_args=index; #ifdef UCL_DEBUG if (_num_args>_kernel_info_nargs) { - std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " << _kernel_info_name << std::endl; assert(0==1); } #endif } } - + /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_D_Vec * const arg) + inline void set_arg(const UCL_D_Vec * const arg) { set_arg(&arg->begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_D_Mat * const arg) + inline void set_arg(const UCL_D_Mat * const arg) { set_arg(&arg->begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_Vector * const arg) + inline void set_arg(const UCL_Vector * const arg) { set_arg(&arg->device.begin()); } /// Set a geryon container as a kernel argument. template - inline void set_arg(const UCL_Matrix * const arg) + inline void set_arg(const UCL_Matrix * const arg) { set_arg(&arg->device.begin()); } /// Add a kernel argument. template inline void add_arg(const dtype * const arg) { - CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); - _num_args++; + CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg)); + _num_args++; #ifdef UCL_DEBUG if (_num_args>_kernel_info_nargs) { - std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " + std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: " << _kernel_info_name << std::endl; assert(0==1); } @@ -228,31 +228,31 @@ class UCL_Kernel { /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_D_Vec * const arg) + inline void add_arg(const UCL_D_Vec * const arg) { add_arg(&arg->begin()); } /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_D_Mat * const arg) + inline void add_arg(const UCL_D_Mat * const arg) { add_arg(&arg->begin()); } /// Add a geryon container as a kernel argument. 
template - inline void add_arg(const UCL_Vector * const arg) + inline void add_arg(const UCL_Vector * const arg) { add_arg(&arg->device.begin()); } /// Add a geryon container as a kernel argument. template - inline void add_arg(const UCL_Matrix * const arg) + inline void add_arg(const UCL_Matrix * const arg) { add_arg(&arg->device.begin()); } /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ - inline void set_size(const size_t num_blocks, const size_t block_size) { - _dimensions=1; - _num_blocks[0]=num_blocks*block_size; - _block_size[0]=block_size; + inline void set_size(const size_t num_blocks, const size_t block_size) { + _dimensions=1; + _num_blocks[0]=num_blocks*block_size; + _block_size[0]=block_size; } /// Set the number of thread blocks and the number of threads in each block @@ -266,36 +266,36 @@ class UCL_Kernel { /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, - const size_t block_size_x, const size_t block_size_y) { - _dimensions=2; - _num_blocks[0]=num_blocks_x*block_size_x; - _block_size[0]=block_size_x; - _num_blocks[1]=num_blocks_y*block_size_y; - _block_size[1]=block_size_y; + const size_t block_size_x, const size_t block_size_y) { + _dimensions=2; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; } - + /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue for the kernel is changed to cq **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, const size_t block_size_x, const size_t block_size_y, - command_queue &cq) + command_queue &cq) {_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);} /// Set the number of thread blocks and the number of threads in each block /** \note This should be called before any arguments have been added \note The default command queue is used for the kernel execution **/ inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y, - const size_t block_size_x, + const size_t block_size_x, const size_t block_size_y, const size_t block_size_z) { - _dimensions=3; + _dimensions=3; const size_t num_blocks_z=1; - _num_blocks[0]=num_blocks_x*block_size_x; - _block_size[0]=block_size_x; - _num_blocks[1]=num_blocks_y*block_size_y; - _block_size[1]=block_size_y; - _num_blocks[2]=num_blocks_z*block_size_z; - _block_size[2]=block_size_z; + _num_blocks[0]=num_blocks_x*block_size_x; + _block_size[0]=block_size_x; + _num_blocks[1]=num_blocks_y*block_size_y; + _block_size[1]=block_size_y; + _num_blocks[2]=num_blocks_z*block_size_z; + _block_size[2]=block_size_z; } /// Set the number of thread blocks and the number of threads in each block @@ -305,13 +305,13 @@ class UCL_Kernel { const size_t block_size_x, const size_t block_size_y, const size_t block_size_z, command_queue &cq) { _cq=cq; - set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, + set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y, block_size_z); } - + /// Run the kernel in the default command queue inline void run(); - + /// Clear any arguments associated with the kernel 
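/// (Typical use is to reset the argument list before a relaunch with new
///  values; for an illustrative kernel object k with hypothetical
///  arguments d_x and n: k.clear_args(); k.add_arg(&d_x); k.add_arg(&n);
///  k.run(); )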
inline void clear_args() { _num_args=0; } @@ -320,7 +320,7 @@ /// Change the default command queue associated with matrix inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" - + private: cl_kernel _kernel; cl_program _program; @@ -328,7 +328,7 @@ size_t _block_size[3]; size_t _num_blocks[3]; bool _function_set; - + cl_command_queue _cq; // The default command queue for this kernel unsigned _num_args; @@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) CL_SAFE_CALL(clRetainProgram(_program)); cl_int error_flag; _kernel=clCreateKernel(program._program,function,&error_flag); - + if (error_flag!=CL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not find function: " << function @@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif return UCL_FUNCTION_NOT_FOUND; } - + #ifdef UCL_DEBUG _kernel_info_name=function; cl_uint nargs; @@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) #endif #endif - return UCL_SUCCESS; + return UCL_SUCCESS; } void UCL_Kernel::run() { diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 2909d72a72..3135594dc3 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -17,12 +17,12 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ /*! \file */ - + #ifndef OCL_MAT_H #define OCL_MAT_H @@ -54,6 +54,6 @@ namespace ucl_opencl { #include "ucl_print.h" #undef UCL_PRINT_ALLOW -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 7aed0a1a8c..28bb88941f 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -36,10 +36,10 @@ namespace ucl_opencl { // -------------------------------------------------------------------------- struct ocl_kernel_dim { size_t x,y,z; - ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) : + ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) : x(_x), y(_y), z(_z) {} operator size_t * () { return (size_t *)this; } - operator const size_t * () const { return (const size_t *)this; } + operator const size_t * () const { return (const size_t *)this; } }; typedef ocl_kernel_dim ucl_kernel_dim; @@ -53,13 +53,13 @@ typedef cl_mem device_ptr; // -------------------------------------------------------------------------- template -inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, +inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_int error_flag; cl_context context; CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), &context,NULL)); - + cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind2==UCL_NOT_SPECIFIED) { @@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; else buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR; - + if (kind==UCL_READ_ONLY) { #ifdef CL_VERSION_1_2 buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY; @@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, } else map_perm=CL_MAP_READ | CL_MAP_WRITE; } - + mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, @@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), &orig_flags,NULL)); orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; - + mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, *mat.host_ptr(), &error_flag); @@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { } template -inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, +inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ cl_mem_flags buffer_perm; cl_map_flags map_perm; @@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, cl_int error_flag; mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) @@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) { map_perm=CL_MAP_READ | CL_MAP_WRITE; mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; *mat.host_ptr() = (typename mat_type::data_type*) clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE, @@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n, else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; mat.cq()=cm.cq(); 
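// The new allocation adopts the source container's command queue; the
// retain call below bumps the queue's OpenCL reference count so the
// handle stays valid for as long as this matrix holds it.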
CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n, assert(0==1); mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL, &error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; mat.cq()=dev.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows, if (dev.device_type()!=UCL_CPU && cols%256!=0) padded_cols+=256-cols%256; pitch=padded_cols*sizeof(typename mat_type::data_type); - return _device_alloc(mat,dev,pitch*rows,kind); + return _device_alloc(mat,dev,pitch*rows,kind); } template @@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) { else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; return UCL_SUCCESS; } @@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows, else assert(0==1); mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag); - if (error_flag != CL_SUCCESS) + if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; return UCL_SUCCESS; } @@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) { inline void _ocl_build(cl_program &program, cl_device_id &device, const char* options = "") { clBuildProgram(program,1,&device,options,NULL,NULL); - + cl_build_status build_status; - CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, + CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status),&build_status, NULL)); if (build_status == CL_SUCCESS) return; - + size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0, + CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0, NULL, &ms)); - char build_log[ms]; + char build_log[ms]; CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms, build_log, NULL)); - + std::cerr << std::endl << "----------------------------------------------------------\n" << " Error compiling OpenCL Program...\n" @@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device, cl_kernel &kernel, const char *function, const char *options="") { cl_int error_flag; - + cl_program program=clCreateProgramWithSource(context,lines,source, NULL,&error_flag); - CL_CHECK_ERR(error_flag); + CL_CHECK_ERR(error_flag); _ocl_build(program,device,options); kernel=clCreateKernel(program,function,&error_flag); - CL_CHECK_ERR(error_flag); + CL_CHECK_ERR(error_flag); } template @@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { cl_device_id device; CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES, sizeof(cl_device_id),&device,NULL)); - + const char * szero[3]={ "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void _device_zero(__global NUMTYP *a, const int offset)", " { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }" }; - + cl_kernel kzero; _ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero", _UCL_DATA_ID::numtyp_flag()); - + cl_int offset=mat.offset(); CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin())); CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset)); @@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> { assert(0==1); } template - static inline void mc(p1 &dst, const 
size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> { assert(0==1); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, @@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> { dst.begin(),0,NULL,NULL)); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, - const size_t rows, cl_command_queue &cq, + const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { @@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 2NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset, spitch*rows, (char *)dst.begin()+dst_offset,0,NULL, NULL)); else - for (size_t i=0; i struct _ucl_memcpy<0,1> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 3S\n"; #endif - return; + return; } #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 3NS\n"; @@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> { src.begin(),0,NULL,NULL)); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, - const size_t rows, cl_command_queue &cq, + const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { if (src.cbegin()==dst.cbegin()) { @@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 4S\n"; #endif - return; + return; } #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 4NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset, spitch*rows, @@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> { NULL)); src_offset+=spitch; dst_offset+=dpitch; - } + } } }; @@ -687,33 +687,33 @@ template 
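// Primary template: covers the remaining memory-type combinations, in
// practice device-to-device transfers. The specializations above handle
// host reads/writes and reject unsupported image copies; here the copy
// is forwarded to clEnqueueCopyBuffer unless src and dst alias the same
// buffer and offset, in which case nothing needs to be moved.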
struct _ucl_memcpy { #ifdef UCL_DBG_MEM_TRACE else std::cerr << "UCL_COPY 6S\n"; #endif - + if (block==CL_TRUE) ucl_sync(cq); } template - static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, + static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, const size_t spitch, const size_t cols, const size_t rows, cl_command_queue &cq, const cl_bool block, size_t dst_offset, size_t src_offset) { - if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { + if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 7NS\n"; #endif - if (spitch==dpitch && dst.cols()==src.cols() && + if (spitch==dpitch && dst.cols()==src.cols() && src.cols()==cols/src.element_size()) CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset, dst_offset,spitch*rows,0,NULL,NULL)); - + else - for (size_t i=0; i -inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, - const size_t spitch, const size_t cols, + const size_t spitch, const size_t cols, const size_t rows) { _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, rows,dst.cq(),CL_TRUE, @@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, } template -inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, - const size_t spitch, const size_t cols, +inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src, + const size_t spitch, const size_t cols, const size_t rows,cl_command_queue &cq) { _ucl_memcpy::mc(dst,dpitch,src,spitch,cols, rows,cq,CL_FALSE, dst.byteoff(),src.byteoff()); } -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8e72c51730..0e60045f55 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -28,7 +28,7 @@ #include "ocl_mat.h" namespace ucl_opencl { - + /// Class storing a texture reference class UCL_Texture { public: @@ -46,9 +46,9 @@ class UCL_Texture { /// Unbind the texture reference from the memory allocation inline void unbind() { } - /// Make a texture reference available to kernel + /// Make a texture reference available to kernel inline void allow(UCL_Kernel &kernel) { } - + private: friend class UCL_Kernel; }; diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 627d19d66f..66b79dcab1 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ @@ -67,33 +67,33 @@ class UCL_Timer { clRetainCommandQueue(_cq); _initialized=true; } - + /// Start timing on default command queue inline void start() { UCL_OCL_MARKER(_cq,&start_event); } - + /// Stop timing on default command queue inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); } - + /// Block until the start event has been reached on device - inline void sync_start() + inline void sync_start() { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); } /// Block until the stop event has been reached on device - inline void sync_stop() + inline void sync_stop() { CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); } /// Set the time elapsed to zero (not the total_time) - inline void zero() - { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } - + inline void zero() + { UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); } + /// Set the total time to zero inline void zero_total() { _total_time=0.0; } - + /// Add time from previous start and stop to total /** Forces synchronization **/ - inline double add_to_total() + inline double add_to_total() { double t=time(); _total_time+=t; return t/1000.0; } - + /// Add a user specified time to the total (ms) inline void add_time_to_total(const double t) { _total_time+=t; } @@ -107,12 +107,12 @@ class UCL_Timer { CL_SAFE_CALL(clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tstart, NULL)); - return (tend-tstart)*t_factor; + return (tend-tstart)*t_factor; } - + /// Return the time (s) of last start to stop - Forces synchronization inline double seconds() { return time()/1000.0; } - + /// Return the total time in ms inline double total_time() { return _total_time; } diff --git a/lib/gpu/geryon/ucl_arg_kludge.h b/lib/gpu/geryon/ucl_arg_kludge.h index 646aa4d68f..eea913863d 100644 --- a/lib/gpu/geryon/ucl_arg_kludge.h +++ b/lib/gpu/geryon/ucl_arg_kludge.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ @@ -38,47 +38,47 @@ template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); } template inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); } template inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); run(); } @@ -434,8 +434,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); run(); } @@ -444,8 +444,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); run(); } @@ -454,8 +454,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); run(); } @@ -464,8 +464,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); run(); } @@ -474,8 +474,8 @@ inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) { clear_args(); - add_arg(a1); 
add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); run(); } @@ -486,9 +486,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); run(); } @@ -499,8 +499,8 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); add_arg(a11); add_arg(a12); run(); } @@ -512,9 +512,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); run(); } @@ -525,9 +525,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13, t14 *a14) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); run(); } @@ -538,9 +538,9 @@ t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); run(); } @@ -553,10 +553,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); run(); } @@ -569,10 +569,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); + add_arg(a1); add_arg(a2); add_arg(a3); 
add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); run(); } @@ -585,10 +585,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); run(); } @@ -601,10 +601,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18, t19 *a19) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); run(); } @@ -617,10 +617,10 @@ t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); run(); } @@ -635,10 +635,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); run(); } @@ -654,10 +654,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); 
add_arg(a21); add_arg(a22); run(); } @@ -673,10 +673,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); add_arg(a22); add_arg(a23); run(); } @@ -692,10 +692,10 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23, t24 *a24) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); run(); } @@ -711,11 +711,11 @@ t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); run(); } @@ -732,11 +732,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); run(); } @@ -754,11 +754,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); 
add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); add_arg(a27); run(); } @@ -776,12 +776,12 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); run(); } @@ -798,11 +798,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28, t29 *a29) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); run(); } @@ -820,11 +820,11 @@ t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { clear_args(); - add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); - add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); - add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); - add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); - add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); - add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); run(); } diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 4edf83e057..1ded9f043b 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -17,7 +17,7 @@ /* 
----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -52,10 +52,10 @@ /// Base class for vector/matrix containers /** All containers are associated with a default command queue. * For CUDA, this is the default stream. - * - * The default queue is used for asynchonrous operations on the container + * + * The default queue is used for asynchronous operations on the container * that do not specify a queue. For OpenCL, this queue is also used in - * calls for reserving and copying memory **/ + * calls for reserving and copying memory **/ class UCL_BaseMat { public: UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { } @@ -68,8 +68,8 @@ class UCL_BaseMat { inline void sync() { ucl_sync(_cq); } /// Return the type/permissions of memory allocation /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED - * or UCL_VIEW **/ - inline enum UCL_MEMOPT kind() const { return _kind; } + * or UCL_VIEW **/ + inline enum UCL_MEMOPT kind() const { return _kind; } inline bool shared_mem_device() { #ifdef _OCL_MAT @@ -79,12 +79,12 @@ class UCL_BaseMat { cl_device_type device_type; CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, sizeof(device_type),&device_type,NULL)); - return _shared_mem_device(device_type); + return _shared_mem_device(device_type); #else return false; #endif } - + protected: command_queue _cq; enum UCL_MEMOPT _kind; diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c6bff97a8c..c906a14f30 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -17,33 +17,33 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ - + /*************************************************************************** The ucl_copy and ucl_cast_copy routines provide a general prototype for copying data between host and device memory (including texture memory) for the matrix and vector types in nvc_memory. - - For host/host and host/device transfers, typecasting is performed - automatically as necessary. - - The routines are written so that all branches can be removed by the + + For host/host and host/device transfers, typecasting is performed + automatically as necessary. + + The routines are written so that all branches can be removed by the compiler during template instantiation. - + The routines currently assume row-major ordering for all types. - + For asynchronous copy in the default command queue, async is boolean true; For asynchronous copy in a specified command queue, async is command queue Otherwise, set async to boolean false; - + When performing frequent data copies that require casting, it is more efficient to allocate a casting buffer once and then pass that buffer to the copy routine. This can be accomplished with the ucl_cast_copy routines.
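  For instance, with d_v a device-side container, h_v a host-side
  container of a different data type, and buf a host-side cast buffer
  allocated beforehand with at least the same number of elements (all
  three names are placeholders):

     ucl_copy(d_v,h_v,true);                  // temporary cast buffer each call
     ucl_cast_copy(d_v,h_v,h_v.numel(),buf);  // reuses buf on every call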
- - Examples + + Examples (x's represent alignment padding - to maintain alignment) (o's represent a larger matrix in memory) (vectors represented as single row) @@ -51,18 +51,18 @@ dst src command ---------------------------------------------------------------- 0 1 2 3 4 <-- 0 1 2 3 4 ucl_copy(dst,src,async) - + 0 1 2 3 <-- 0 1 2 3 4 ucl_copy(dst,src,4,async) - + 0 1 2 <-- 0 1 2 3 4 5 ucl_copy(dst,src,async) - 3 4 5 - + 3 4 5 + 0 1 2 3 4 5 <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 - + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 3 4 5 - + 0 1 2 <-- 0 1 2 ucl_copy(dst,src,6,async) 3 4 5 3 4 5 5 6 7 @@ -70,33 +70,33 @@ 0 1 2 <-- 0 1 2 3 ucl_copy(dst,src,2,3,async) 4 5 6 4 5 6 7 8 9 10 11 - + 0 1 2 x x <-- 0 1 2 ucl_copy(dst,src,async) 3 4 5 x x 3 4 5 - + 0 1 2 <-- 0 1 2 x x ucl_copy(dst,src,async) 3 4 5 3 4 5 x x - + 0 1 2 o o <-- 0 1 2 ucl_copy(dst,src,2,3,async) 3 4 5 o o 3 4 5 - o o o o o + o o o o o 0 1 2 o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,3,async) - 3 4 5 o o - o o o o o + 3 4 5 o o + o o o o o 0 1 o o o <-- 0 1 2 3 4 5 ucl_copy(dst,src,2,2,async) - 2 3 o o o - o o o o o + 2 3 o o o + o o o o o 0 1 2 o o <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 5 6 7 o o 5 6 7 8 9 o o o o o 10 11 12 13 14 - + 0 1 2 5 6 7 <-- 0 1 2 3 4 ucl_copy(dst,src,2,3,async) 5 6 7 8 9 10 11 12 13 14 - + ***************************************************************************/ // Only allow this file to be included by nvc_memory.h and ocl_memory.h @@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) { assert(0==1); } } -} +} // -------------------------------------------------------------------------- // - HOST-HOST COPY ROUTINES @@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> { return; } #endif - + #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 8NS\n"; #endif @@ -212,7 +212,7 @@ template struct _host_host_copy { static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols) { assert(0==1); - } + } }; // -------------------------------------------------------------------------- @@ -242,20 +242,20 @@ template struct _ucl_cast_copy<1,host_type2> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - // Asynchronous currently pointless here + // Asynchronous currently pointless here #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - #endif + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif if (mat1::VECTOR) { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type),rows); for (size_t i=0; i(cast_buffer[i]); } else { - if (mat2::VECTOR) + if (mat2::VECTOR) ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),rows); @@ -276,23 +276,23 @@ template struct _ucl_cast_copy<1,host_type2> { } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, + const size_t cols, mat3 &cast_buffer, command_queue &cq) { - // Asynchronous currently pointless here + // Asynchronous currently pointless here #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); 
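// Debug-mode sanity checks: both operands must be row-major, and the
// destination and cast buffer must hold at least rows*cols elements;
// true 2D matrices get additional rows/cols bounds checks below.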
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - #endif + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + #endif if (mat1::VECTOR) { ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq); - cast_buffer.sync(); + cast_buffer.sync(); for (size_t i=0; i(cast_buffer[i]); } else { - if (mat2::VECTOR) + if (mat2::VECTOR) ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src, cols*sizeof(typename mat2::data_type), cols*sizeof(typename mat2::data_type),rows,cq); @@ -338,7 +338,7 @@ template struct _ucl_cast_copy { assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - if (mat3::VECTOR==0) { + if (mat3::VECTOR==0) { assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols); } @@ -404,9 +404,9 @@ template struct _ucl_cast_copy { #ifdef UCL_DEBUG assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1); assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols); - if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); - if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); - if (mat3::VECTOR==0) { + if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols); + if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols); + if (mat3::VECTOR==0) { assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols); assert(dst.rows()>=rows && dst.cols()>=cols); } @@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } }; @@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> { template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer) { - assert(0==1); + assert(0==1); } template static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, const size_t cols, mat3 &cast_buffer, command_queue &cq) { - assert(0==1); + assert(0==1); } }; @@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> { /** \param numel Number of elements (not bytes) to copy * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. 
+ * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, * \param async Perform non-blocking copy on default stream * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel, * buffer is created for copy. When multiple casts occur, it is * more efficient to create a permanent casting buffer that can * be passed to an alternative copy routine. - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Currently does not handle textures **/ template inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, @@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,numel); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, _ucl_cast_copy::cc(dst,src,numel, cast_buffer,cq); } - } else - ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); + } else + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq); } /// Copy matrix/vector (memory already allocated) @@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, * buffer is created for copy. When multiple casts occur, it is * more efficient to create a permanent casting buffer that can * be passed to an alternative copy routine. - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. 
* - The default stream is used for asynchronous copy * - Currently does not handle textures **/ template @@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, cast_buffer); } } else - ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); + ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type)); } // -------------------------------------------------------------------------- @@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel, /** \param async Perform non-blocking copy on default stream * \param cast_buffer Buffer on host with enough storage for casting * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer) /** \param cast_buffer Buffer on host with enough storage for casting * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into upper left tile of matrix + * - If dst is a matrix, routine will copy into upper left tile of matrix * - If the data types for the two matrices are same, no cast performed - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, + const size_t cols, mat3 &cast_buffer, command_queue &cq) { if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,rows,cols,cq); @@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Asynchronous copy of subset matrix rows,cols (memory already allocated) /** - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types of the two matrices are not the same, - * casting will be performed automatically as long as the copy is + * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary * buffer is created for copy. 
When multiple casts occur, it is * more efficient to create a permanent casting buffer that can @@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, #endif if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, /// Copy subset of matrix rows,cols (memory already allocated) /** \param async Perform non-blocking copy (ignored for host to host copy) * - If src is a vector, routine assumes row-major rows by cols copy - * - If src is a matrix, routine will copy upper left tile of matrix + * - If src is a matrix, routine will copy upper left tile of matrix * - If dst is a vector, routine assumes row-major rows by cols copy - * - If dst is a matrix, routine will copy into left tile of matrix + * - If dst is a matrix, routine will copy into left tile of matrix * - If the data types of the two matrices are not the same, * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary @@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, ucl_copy(dst,src,rows,cols,dst.cq()); else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1) _host_host_copy::hhc(dst,src,rows,cols); - else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && + else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE && (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) { if (mat1::MEM_TYPE==1) { UCL_H_Vec cast_buffer; @@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows, * \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed * - The number of bytes copied is determined by entire src data - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, /** \param cast_buffer Buffer on host with enough storage for casting * - If the data types for the two matrices are same, no cast performed * - The number of bytes copied is determined by entire src data - * - Padding for 2D matrices is not considered in this routine. + * - Padding for 2D matrices is not considered in this routine. * - Copy from vector to matrix and vice versa allowed * - Currently does not handle textures **/ template @@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, /// Asynchronous copy of matrix/vector (memory already allocated) /** - The number of bytes copied is determined by entire src data * - If the data types of the two matrices are not the same, - * casting will be performed automatically as long as the copy is + * casting will be performed automatically as long as the copy is * not device to device. For host/device transfers, a temporary * buffer is created for copy. 
When multiple casts occur, it is * more efficient to create a permanent casting buffer that can @@ -924,7 +924,7 @@ template inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) { if (async) ucl_copy(dst,src,dst.cq()); - else if (dst.row_bytes()==src.row_bytes() && + else if (dst.row_bytes()==src.row_bytes() && src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW && (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE) ucl_copy(dst,src,src.row_size()*src.rows(),async); diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index f1aaa27903..da55cc6ebc 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 0 }; - typedef numtyp data_type; + typedef numtyp data_type; UCL_D_Mat() : _cols(0) {} ~UCL_D_Mat() { _device_free(*this); } - + /// Construct with specified rows and cols /** \sa alloc() **/ UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0) { alloc(rows,cols,device,kind); } - + /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: * - UCL_READ_WRITE - Specify that you will read and write in kernels * - UCL_WRITE_ONLY - Specify that you will only write in kernels * - UCL_READ_ONLY - Specify that you will only read in kernels - * \param cq Default command queue for operations copied from another mat + * \param cq Default command queue for operations copied from another mat * \note - Coalesced access using adjacent cols on same row * UCL_D_Mat(row,col) given by array[row*row_size()+col] * \return UCL_SUCCESS if the memory allocation is successful **/ @@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat { int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not allocate " + std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; UCL_GERYON_EXIT; #endif @@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Row major matrix on device /** The kind parameter controls memory optimizations as follows: * - UCL_READ_WRITE - Specify that you will read and write in kernels @@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
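As a usage note for the UCL_D_Mat allocation path above, a hedged sketch (hypothetical names; assumes UCL_NO_EXIT is defined so failures are reported through the return code instead of aborting):

    UCL_Device dev;
    UCL_D_Mat<float> m;
    if (m.alloc(128, 128, dev) != UCL_SUCCESS) {
      // handle the failure; without UCL_NO_EXIT the library prints
      // "UCL Error: Could not allocate ..." and exits instead
    }
    m.zero();              // asynchronous zero on m's default command queue
    m.resize_ib(64, 64);   // no-op here: resize_ib only grows the allocation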
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { @@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat { #else _device_view(&_array,input.begin()); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) + inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows(),input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) { + const size_t stride, UCL_Device &dev) { clear(); _kind=UCL_VIEW; _cols=cols; @@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat { template inline void view(ptr_type input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat { template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols, const size_t stride) { @@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat { #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat { * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) + const size_t cols) { view_offset(offset,input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view_offset(const size_t offset, ucl_type &input) { - if (input.rows()==1) + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) view_offset(offset,input,1,input.cols()-offset); - else + else view_offset(offset,input,input.rows()-offset/input.row_size(), input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
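A sketch of the view_offset() idiom documented above for UCL_D_Mat, aliasing a 3x3 tile inside a larger pitched device matrix (hypothetical names; offsets are in elements, and passing the parent's row_size() as the stride keeps row padding consistent):

    UCL_Device dev;
    UCL_D_Mat<float> big(8, 8, dev);
    UCL_D_Mat<float> tile;
    // Tile starting at row 2, column 2 of 'big'. 'tile' aliases 'big';
    // no memory is freed when 'tile' is destructed, and 'big' must
    // outlive any use of the view.
    tile.view_offset(2*big.row_size() + 2, big, 3, 3, big.row_size());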
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, const size_t cols,const size_t stride, - UCL_Device &dev) { + UCL_Device &dev) { clear(); _kind=UCL_VIEW; _cols=cols; @@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat { _pitch=stride*sizeof(numtyp); _row_size=stride; this->_cq=dev.cq(); - + #ifdef _OCL_MAT _array=input; _offset=offset; @@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat { _array=input+offset; #endif #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif @@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat { * allocating container when using CUDA APIs **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, - const size_t cols, UCL_Device &dev) + const size_t cols, UCL_Device &dev) { view_offset(offset,input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset, ptr_type input, + inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) { view_offset(offset,input,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements @@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat { int err=_device_resize(*this,rows,cols,_pitch); if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << "UCL Error: Could not allocate " + std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; UCL_GERYON_EXIT; #endif @@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Resize (only if bigger) the allocation to contain rows x cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int rows, const int cols) - { if (cols>_cols || rows>_rows) return resize(rows,cols); + { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } /// Set each element to zero asynchronously in the default command_queue @@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat { /// Set first n elements to zero asynchronously in the default command_queue inline void zero(const int n) { zero(n,_cq); } /// Set each element to zero asynchronously - inline void zero(command_queue &cq) + inline void zero(command_queue &cq) { _device_zero(*this,row_bytes()*_rows,cq); } /// Set first n elements to zero asynchronously - inline void zero(const int n, command_queue &cq) + inline void zero(const int n, command_queue &cq) { _device_zero(*this,n*sizeof(numtyp),cq); } @@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat { inline size_t row_bytes() const { return _pitch; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + #ifdef _OCL_MAT /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ @@ -459,7 +459,7 @@ 
class UCL_D_Mat : public UCL_BaseMat { /// Return the offset (in bytes) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t byteoff() const { return offset()*sizeof(numtyp); } - + private: size_t _pitch, _row_size, _rows, _cols; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index fc1977f4b5..99a6c939c6 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -17,14 +17,14 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ // Only allow this file to be included by CUDA and OpenCL specific headers #ifdef _UCL_MAT_ALLOW -/// Row vector on device +/// Row vector on device template class UCL_D_Vec : public UCL_BaseMat { public: @@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 1 }; - typedef numtyp data_type; + typedef numtyp data_type; UCL_D_Vec() : _cols(0) {} ~UCL_D_Vec() { _device_free(*this); } @@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Construct with n columns /** \sa alloc() **/ UCL_D_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) : + const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0) { alloc(n,device,kind); } /// Set up host vector with 'cols' columns and reserve memory @@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat { template inline int alloc(const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { - + clear(); _row_bytes=cols*sizeof(numtyp); @@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; - } + return err; + } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory optimizations as follows: @@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } /// Do not allocate memory, instead use an existing allocation from Geryon @@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat { #else _device_view(&_array,input.begin()); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { view(input,rows,cols); } @@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows()*input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat { CL_SAFE_CALL(clRetainCommandQueue(dev.cq())); #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) + const size_t stride, UCL_Device &dev) { view(input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation @@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat { template inline void view(ptr_type input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. @@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat { #else _device_view(&_array,input.begin(),offset,sizeof(numtyp)); #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) + const size_t cols, const size_t stride) { view_offset(offset,input,rows,cols); } /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
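Combining the vector-view overloads above, a hypothetical sketch that aliases a single row of a device matrix as a UCL_D_Vec:

    UCL_Device dev;
    UCL_D_Mat<float> m(4, 100, dev);
    UCL_D_Vec<float> row;
    // Row r starts r*row_size() elements into the allocation (row_size()
    // is the pitch in elements) and spans m.cols() contiguous elements.
    const size_t r = 2;
    row.view_offset(r*m.row_size(), m, m.cols());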
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) * will be used for view **/ template - inline void view_offset(const size_t offset, ucl_type &input) + inline void view_offset(const size_t offset, ucl_type &input) { view_offset(offset,input,input.rows()*input.row_size()-offset); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the @@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=dev.cq(); - + #ifdef _OCL_MAT _array=input; _offset=offset; @@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat { _array=input+offset; #endif #endif - + #ifndef _UCL_DEVICE_PTR_MAT _end=_array+_cols; #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view_offset(offset,input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation @@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset, ptr_type input, + inline void view_offset(const size_t offset, ptr_type input, const size_t cols, UCL_Device &dev) { view_offset(offset,input,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _device_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to contain cols elements @@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat { #ifdef _OCL_MAT _offset=0; #endif - return err; + return err; } - + /// Resize (only if bigger) the allocation to contain cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int cols) @@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat { /// Set each element to zero asynchronously inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); } /// Set first n elements to zero asynchronously - inline void zero(const int n, command_queue &cq) + inline void zero(const int n, command_queue &cq) { _device_zero(*this,n*sizeof(numtyp),cq); } #ifdef _UCL_DEVICE_PTR_MAT @@ -402,7 +402,7 @@ 
class UCL_D_Vec : public UCL_BaseMat { /// For CUDA-RT, get device pointer to one past last element inline numtyp * end() const { return _end; } #endif - + #ifdef _UCL_DEVICE_PTR_MAT /// Returns an API specific device pointer /** - For OpenCL, returns a &cl_mem object @@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat { inline const numtyp ** cbegin() const { return &_array; } /// For CUDA-RT, allocate row vector and bind texture inline void safe_alloc(const size_t cols, UCL_Device &dev, - textureReference *t) + textureReference *t) { alloc(cols,dev); assign_texture(t); bind(); } /// For CUDA-RT, assign a texture to matrix - inline void assign_texture(textureReference *t) { _tex_ptr=t; } + inline void assign_texture(textureReference *t) { _tex_ptr=t; } /// For CUDA-RT, bind to texture inline void bind() { cuda_gb_get_channel(_channel); @@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + #ifdef _OCL_MAT /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ @@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat { private: size_t _row_bytes, _row_size, _rows, _cols; - + #ifdef _UCL_DEVICE_PTR_MAT device_ptr _array; #else diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index dc6da3de0c..1df3c2de4b 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 0 }; - typedef numtyp data_type; - + typedef numtyp data_type; + UCL_H_Mat() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } ~UCL_H_Mat() { _host_free(*this); } - + /// Construct with specied number of rows and columns /** \sa alloc() **/ - UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) + UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: * - UCL_READ_WRITE - Specify that you will read and write from host @@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat { _kind=kind; _end=_array+rows*cols; return err; - } + } /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind parameter controls memory pinning as follows: @@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat { _kind=kind; _end=_array+rows*cols; return err; - } - + } + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. 
* No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { @@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) + inline void view(ucl_type &input, const size_t rows, const size_t cols) { view(input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * will be used for view when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows(),input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) { + const size_t stride, UCL_Device &dev) { assert(rows==1 || stride==cols); clear(); _kind=UCL_VIEW; @@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat { this->_cq=dev.cq(); _array=input; _end=_array+_cols; - + #ifdef _OCL_MAT _host_view(*this,dev,_row_bytes*rows); - #endif + #endif } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, UCL_Device &dev) { view(input,rows,cols,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) { + const size_t cols, const size_t stride) { assert(rows==1 || stride==cols); clear(); _kind=UCL_VIEW; @@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat { _host_view(*this,input,_row_bytes*_rows); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) + const size_t cols) { view_offset(offset,input,rows,cols,input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset, ucl_type &input) { - if (input.rows()==1) + inline void view_offset(const size_t offset, ucl_type &input) { + if (input.rows()==1) view_offset(offset,input,1,input.cols()-offset); - else + else view_offset(offset,input,input.rows()-offset/input.row_size(), input.cols()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container - * - Viewing a device pointer on the host is not supported **/ + * allocating container + * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, const size_t cols, UCL_Device &dev) { view(input+offset,rows,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
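A short sketch of the host-matrix staging pattern UCL_H_Mat supports: allocate pinned host memory, fill it through the 2D accessor, then push it to the device (hypothetical names; the UCL_MEMOPT argument only controls how the host allocation is pinned):

    UCL_Device dev;
    UCL_H_Mat<double> h(4, 4, dev, UCL_WRITE_ONLY);  // host staging matrix
    for (int i=0; i<4; i++)
      for (int j=0; j<4; j++)
        h(i,j) = i*4 + j;                            // operator()(row,col)
    UCL_D_Mat<double> d(4, 4, dev);
    ucl_copy(d, h, true);                            // async copy on d's queue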
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view(input+offset,rows,cols,stride,dev); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template - inline void view_offset(const size_t offset, ptr_type *input, + inline void view_offset(const size_t offset, ptr_type *input, const size_t cols, UCL_Device &dev) { view(input+offset,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() - { _host_free(*this); _cols=0; _kind=UCL_VIEW; } + inline void clear() + { _host_free(*this); _cols=0; _kind=UCL_VIEW; } /// Resize the allocation to rows x cols elements /** \note Cannot be used on views **/ @@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat { /// Resize (only if bigger) the allocation to contain rows x cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int rows, const int cols) - { if (cols>_cols || rows>_rows) return resize(rows,cols); + { if (cols>_cols || rows>_rows) return resize(rows,cols); else return UCL_SUCCESS; } /// Set each element to zero @@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + /// Get element at index i inline numtyp & operator[](const int i) { return _array[i]; } /// Get element at index i inline const numtyp & operator[](const int i) const { return _array[i]; } - /// 2D access (row should always be 0) - inline numtyp & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) { return _array[row*_cols+col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const numtyp & operator()(const int row, const int col) const { return _array[row*_cols+col]; } - + /// Returns pointer to memory pointer for allocation on host inline numtyp ** host_ptr() { return &_array; } - + /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t offset() const { return 0; } @@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat { /// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA) inline const void ** cbegin() const { return (const void **)&_array; } #endif - + private: numtyp *_array, *_end; size_t _row_bytes, _rows, _cols; #ifdef _OCL_MAT device_ptr 
_carray; - #endif + #endif }; #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 773facdea0..a9d64349d9 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat { ROW_MAJOR = 1, VECTOR = 1 }; - typedef numtyp data_type; - + typedef numtyp data_type; + UCL_H_Vec() : _cols(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif } ~UCL_H_Vec() { _host_free(*this); } - + /// Construct with n columns /** \sa alloc() **/ - UCL_H_Vec(const size_t n, UCL_Device &device, - const enum UCL_MEMOPT kind=UCL_READ_WRITE) + UCL_H_Vec(const size_t n, UCL_Device &device, + const enum UCL_MEMOPT kind=UCL_READ_WRITE) { _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); } - + /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: * - UCL_READ_WRITE - Specify that you will read and write from host @@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat { _kind=kind; _end=_array+cols; return err; - } + } /// Set up host vector with 'cols' columns and reserve memory /** The kind parameter controls memory pinning as follows: @@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat { << " bytes on host.\n"; _row_bytes=0; UCL_GERYON_EXIT; - #endif + #endif _row_bytes=0; return err; } @@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+cols; return err; } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols) { #ifdef UCL_DEBUG @@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat { CL_SAFE_CALL(clRetainCommandQueue(input.cq())); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ucl_type &input, const size_t rows, const size_t cols, const size_t stride) { view(input,rows,cols); } @@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat { /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view(ucl_type &input, const size_t cols) { view(input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container + * allocating container * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input) + inline void view(ucl_type &input) { view(input,input.rows()*input.row_size()); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, UCL_Device &dev) { @@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat { this->_cq=dev.cq(); _array=input; _end=_array+_cols; - + #ifdef _OCL_MAT _host_view(*this,dev,_row_bytes); - #endif + #endif } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view(ptr_type *input, const size_t rows, const size_t cols, - const size_t stride, UCL_Device &dev) + const size_t stride, UCL_Device &dev) { view(input,rows,cols,stride); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * - Viewing a device pointer on the host is not supported **/ template inline void view(ptr_type *input, const size_t cols, UCL_Device &dev) { view(input,1,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. 
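The raw-pointer view overloads above let Geryon wrap host memory it did not allocate; a hedged sketch (assumes <vector> is included; names hypothetical):

    std::vector<float> buf(1000, 1.0f);   // pre-existing host allocation
    UCL_Device dev;
    UCL_H_Vec<float> v;
    v.view(buf.data(), buf.size(), dev);  // no copy; v aliases buf
    // v can serve as the host side of ucl_copy(); buf must outlive the view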
* - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported **/ + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, const size_t cols) { @@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat { _host_view(*this,input,_row_bytes); #endif } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device container on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device container on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols, const size_t stride) + const size_t cols, const size_t stride) { view_offset(offset,input,rows,cols); } /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template inline void view_offset(const size_t offset,ucl_type &input,const size_t cols) { view_offset(offset,input,1,cols); } - + /// Do not allocate memory, instead use an existing allocation from Geryon /** This function must be passed a Geryon vector or matrix container. * No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs + * allocating container when using CUDA APIs * - If a matrix is used a input, all elements (including padding) - * will be used for view - * - Viewing a device container on the host is not supported **/ + * will be used for view + * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset, ucl_type &input) + inline void view_offset(const size_t offset, ucl_type &input) { view_offset(offset,input,input.rows()*input.row_size()-offset); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, const size_t cols, UCL_Device &dev) { view(input+offset,rows,cols,dev); } - + /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. 
* - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported - * \param stride Number of _elements_ between the start of each row **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported + * \param stride Number of _elements_ between the start of each row **/ template inline void view_offset(const size_t offset,ptr_type *input,const size_t rows, - const size_t cols,const size_t stride,UCL_Device &dev) + const size_t cols,const size_t stride,UCL_Device &dev) { view(input+offset,rows,cols,stride,dev); } /// Do not allocate memory, instead use an existing allocation /** - No memory is freed when the object is destructed. * - The view does not prevent the memory from being freed by the - * allocating container when using CUDA APIs - * - Viewing a device pointer on the host is not supported **/ + * allocating container when using CUDA APIs + * - Viewing a device pointer on the host is not supported **/ template - inline void view_offset(const size_t offset, ptr_type *input, + inline void view_offset(const size_t offset, ptr_type *input, const size_t cols, UCL_Device &dev) { view(input+offset,1,cols,dev); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { _host_free(*this); _kind=UCL_VIEW; _cols=0; } /// Resize the allocation to contain cols elements @@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat { assert(_kind!=UCL_VIEW); _row_bytes=cols*sizeof(numtyp); int err=_host_resize(*this,_row_bytes); - + if (err!=UCL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes @@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat { _end=_array+cols; return err; } - + /// Resize (only if bigger) the allocation to contain cols elements /** \note Cannot be used on views **/ inline int resize_ib(const int cols) @@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat { /// Set each element to zero inline void zero() { _host_zero(_array,row_bytes()); } - + /// Set first n elements to zero inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); } @@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat { inline size_t row_bytes() const { return _row_bytes; } /// Get the size in bytes of 1 element inline int element_size() const { return sizeof(numtyp); } - + /// Get element at index i inline numtyp & operator[](const int i) { return _array[i]; } /// Get element at index i inline const numtyp & operator[](const int i) const { return _array[i]; } - /// 2D access (row should always be 0) - inline numtyp & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline numtyp & operator()(const int row, const int col) { return _array[col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const numtyp & operator()(const int row, const int col) const { return _array[col]; } - + /// Returns pointer to memory pointer for allocation on host inline numtyp ** host_ptr() { return &_array; } - + /// Return the offset (in elements) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t offset() const { return 0; } /// Return the offset (in bytes) from begin() pointer where data starts /** \note Always 0 for host matrices and CUDA APIs **/ inline size_t byteoff() const { return 0; } - + #ifdef _OCL_MAT /// For OpenCL, returns a reference to the cl_mem 
object inline device_ptr & cbegin() { return _carray; } /// For OpenCL, returns a reference to the cl_mem object inline const device_ptr & cbegin() const { return _carray; } #endif - + private: numtyp *_array, *_end; size_t _row_bytes, _cols; diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h index 301325b454..b93d1c7f68 100644 --- a/lib/gpu/geryon/ucl_matrix.h +++ b/lib/gpu/geryon/ucl_matrix.h @@ -34,25 +34,25 @@ class UCL_Matrix { ROW_MAJOR = 1, VECTOR = 0 }; - typedef hosttype data_type; + typedef hosttype data_type; /// Host Allocation UCL_H_Mat host; - + /// Device Allocation UCL_D_Mat device; UCL_Matrix() { } ~UCL_Matrix() { } - + /// Construct with specied number of rows and columns /** \sa alloc() **/ - UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, + UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -74,7 +74,7 @@ class UCL_Matrix { const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); } - + /// Set up host matrix with specied # of rows/cols and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -92,9 +92,9 @@ class UCL_Matrix { const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { host.clear(); device.clear(); } /// Resize the allocation to contain cols elements @@ -106,10 +106,10 @@ class UCL_Matrix { return _ucl_s_obj_help< ucl_same_type::ans >:: dev_resize(device,host,_buffer,rows,cols); } - + /// Resize (only if bigger) the allocation to contain cols elements inline int resize_ib(const int new_rows, const int new_cols) - { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); + { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols); else return UCL_SUCCESS; } /// Set each element to zero (asynchronously on device) @@ -118,14 +118,14 @@ class UCL_Matrix { inline void zero(const int n) { zero(n,cq()); } /// Set each element to zero (asynchronously on device) inline void zero(command_queue &cq) { - host.zero(); + host.zero(); if (device.kind()!=UCL_VIEW) device.zero(cq); else if (_buffer.numel()>0) _buffer.zero(); } /// Set first n elements to zero (asynchronously on device) - inline void zero(const int n, command_queue &cq) { - host.zero(n); - if (device.kind()!=UCL_VIEW) device.zero(n,cq); + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); else if (_buffer.numel()>0) _buffer.zero(); } @@ -136,26 +136,26 @@ class UCL_Matrix { /// Get the number of columns inline size_t cols() const { return host.cols(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t host_mem_usage() + inline size_t host_mem_usage() { return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); } /// Get the memory usage (bytes) of the s-object 
(including any buffers) - inline size_t device_mem_usage() + inline size_t device_mem_usage() { return device.row_bytes()*device.rows(); } - + /// Get element at index i inline hosttype & operator[](const int i) { return host[i]; } /// Get element at index i inline const hosttype & operator[](const int i) const { return host[i]; } - /// 2D access (row should always be 0) - inline hosttype & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline hosttype & operator()(const int row, const int col) { return host(row,col); } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const hosttype & operator()(const int row, const int col) const { return host(row,col); } - + /// Returns pointer to memory pointer for allocation on host inline hosttype ** host_ptr() { return host.host_ptr(); } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } /// Change the default command queue associated with this data @@ -172,7 +172,7 @@ class UCL_Matrix { /// Update the allocation on the host asynchronously - inline void update_host() + inline void update_host() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(host,device,_buffer,true); } /// Update the allocation on the host (true for asynchronous copy) @@ -202,7 +202,7 @@ class UCL_Matrix { /// Update the allocation on the device asynchronously - inline void update_device() + inline void update_device() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(device,host,_buffer,true); } /// Update the allocation on the device (true for asynchronous copy) diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index bdba8ff7ae..437631ec3a 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -17,7 +17,7 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -53,9 +53,9 @@ typedef struct _double4 double4; #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #endif diff --git a/lib/gpu/geryon/ucl_print.h b/lib/gpu/geryon/ucl_print.h index 87b3d3d7ff..98ae8a8c06 100644 --- a/lib/gpu/geryon/ucl_print.h +++ b/lib/gpu/geryon/ucl_print.h @@ -17,10 +17,10 @@ /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. 
----------------------------------------------------------------------- */ - + // Only allow this file to be included by nvc_memory.h and ocl_memory.h #ifdef UCL_PRINT_ALLOW @@ -40,7 +40,7 @@ template <> struct _ucl_print<1> { } template static inline void p(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim) { int offset=0; int row_size=cols; @@ -58,12 +58,12 @@ template <> struct _ucl_print<1> { } template static inline void p(const mat_type &mat,const size_t rows,const size_t cols, - std::ostream &out,const std::string delim, + std::ostream &out,const std::string delim, const std::string row_delim, UCL_Device &dev) { - p(mat,rows,cols,out,delim,row_delim); + p(mat,rows,cols,out,delim,row_delim); } }; - + template struct _ucl_print { template static inline void p(mat_type &mat, const size_t n, std::ostream &out, @@ -83,7 +83,7 @@ template struct _ucl_print { } template static inline void p(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim) { UCL_H_Vec temp; temp.alloc(mat.rows()*mat.cols(),mat); @@ -91,12 +91,12 @@ template struct _ucl_print { ucl_copy(temp,mat,rows*cols,false); else ucl_copy(temp,mat,rows,cols,false); - _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); } template - static inline void p(const mat_type &mat, const size_t rows, + static inline void p(const mat_type &mat, const size_t rows, const size_t cols,std::ostream &out, - const std::string delim, + const std::string delim, const std::string row_delim, UCL_Device &dev) { UCL_H_Vec temp; temp.alloc(mat.rows()*mat.cols(),dev); @@ -104,9 +104,9 @@ template struct _ucl_print { ucl_copy(temp,mat,rows*cols,false); else ucl_copy(temp,mat,rows,cols,false); - _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); + _ucl_print<1>::p(temp,rows,cols,out,delim,row_delim); } -}; +}; // ------------------------------------------------------------------------- // - Non-const routines that do not require a device object @@ -123,13 +123,13 @@ inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out, } _ucl_print::p(mat,n,out,delim); } - + /// Outputs n elements of mat delimited by a space template inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) { ucl_print(mat,n,out," "); } - + /// Outputs n elements of mat delimited by a space to standard out template inline void ucl_print(mat_type &mat, const size_t n) { @@ -139,8 +139,8 @@ inline void ucl_print(mat_type &mat, const size_t n) { /// Outputs upper left rows and cols of mat delimited by the string delim template inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, - std::ostream &out, const std::string delim, - const std::string row_delim) { + std::ostream &out, const std::string delim, + const std::string row_delim) { if (rows*cols>mat.numel()) { std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " << "that only has " << mat.numel() << " elements."; @@ -148,17 +148,17 @@ inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, } _ucl_print::p(mat,rows,cols,out,delim,row_delim); } - + /// Outputs upper left rows and cols of mat delimited by a space template inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols, std::ostream &out) { ucl_print(mat,rows,cols,out," 
","\n"); } - + /// Outputs upper left rows and cols of mat delimited by a space to std out template -inline void ucl_print(mat_type &mat, const size_t rows, +inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols) { ucl_print(mat,rows,cols,std::cout," ","\n"); } @@ -177,7 +177,7 @@ inline void ucl_print(mat_type &mat, std::ostream &out) { else ucl_print(mat,mat.rows(),mat.cols(),out," ","\n"); } - + // ------------------------------------------------------------------------- // - Const routines that do not require a device object // ------------------------------------------------------------------------- @@ -193,14 +193,14 @@ inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, } _ucl_print::p(mat,n,out,delim,dev); } - + /// Outputs n elements of mat delimited by a space template inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out, UCL_Device &dev) { ucl_print(mat,n,out," ",dev); } - + /// Outputs n elements of mat delimited by a space to standard out template inline void ucl_print(const mat_type &mat, const size_t n, @@ -211,7 +211,7 @@ inline void ucl_print(const mat_type &mat, const size_t n, /// Outputs upper left rows and cols of mat delimited by the string delim template inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, - std::ostream &out, const std::string delim, + std::ostream &out, const std::string delim, const std::string row_delim, UCL_Device &dev) { if (rows*cols>mat.numel()) { std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix " @@ -220,17 +220,17 @@ inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, } _ucl_print::p(mat,rows,cols,out,delim,row_delim,dev); } - + /// Outputs upper left rows and cols of mat delimited by a space template inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols, std::ostream &out, UCL_Device &dev) { ucl_print(mat,rows,cols,out," ","\n",dev); } - + /// Outputs upper left rows and cols of mat delimited by a space to std out template -inline void ucl_print(const mat_type &mat, const size_t rows, +inline void ucl_print(const mat_type &mat, const size_t rows, const size_t cols, UCL_Device &dev) { ucl_print(mat,rows,cols,std::cout," ","\n",dev); } @@ -256,27 +256,27 @@ inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) { template inline std::ostream & operator << (std::ostream &out, UCL_H_Vec &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_H_Mat &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_D_Vec &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_D_Mat &mat) - { ucl_print(mat,out); return out; } + { ucl_print(mat,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_Vector &mat) - { ucl_print(mat.host,out); return out; } + { ucl_print(mat.host,out); return out; } template inline std::ostream & operator << (std::ostream &out, UCL_Matrix &mat) - { ucl_print(mat.host,out); return out; } + { ucl_print(mat.host,out); return out; } #endif diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index 0b8c0251c1..a10f3cdb3f 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ 
-3,7 +3,7 @@ ------------------- W. Michael Brown - Helper routines for allocating memory for s-objects and performing + Helper routines for allocating memory for s-objects and performing host/device updates. (Different routines depending on whether the same type is used on the host and device). @@ -141,29 +141,29 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } - + template static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { if (device.kind()==UCL_VIEW) { @@ -181,7 +181,7 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, + static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); @@ -255,7 +255,7 @@ template struct _ucl_s_obj_help { e1=_buffer.alloc(cols,cq,kind1); if (e1!=UCL_SUCCESS) return e1; - return device.alloc(cols,cq,kind2); + return device.alloc(cols,cq,kind2); } } @@ -314,7 +314,7 @@ template struct _ucl_s_obj_help { e1=_buffer.alloc(rows,cols,cq,kind1); if (e1!=UCL_SUCCESS) return e1; - return device.alloc(rows,cols,cq,kind2); + return device.alloc(rows,cols,cq,kind2); } } @@ -329,25 +329,25 @@ template struct _ucl_s_obj_help { } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, const bool async) { ucl_cast_copy(dst,src,cols,buffer,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, + static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, command_queue &cq) { ucl_cast_copy(dst,src,cols,buffer,cq); } - + template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, const bool async) { ucl_cast_copy(dst,src,rows,cols,buffer,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 &buffer, command_queue &cq) { ucl_cast_copy(dst,src,rows,cols,buffer,cq); } @@ -373,7 +373,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, + static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows, const int cols) { int err=buff.resize(rows,cols); if (err!=UCL_SUCCESS) diff --git a/lib/gpu/geryon/ucl_types.h b/lib/gpu/geryon/ucl_types.h index 46be4bcb06..cb3dce8430 100644 --- a/lib/gpu/geryon/ucl_types.h +++ b/lib/gpu/geryon/ucl_types.h @@ -17,7 +17,7 @@ /* 
----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ @@ -26,65 +26,65 @@ // Assign an integer id based on the data type: (int, float, double, etc) template struct _UCL_DATA_ID; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=1 }; - static inline const char * name() { return "double"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } + static inline const char * name() { return "double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=double"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=2 }; - static inline const char * name() { return "float"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } + static inline const char * name() { return "float"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=float"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=3 }; - static inline const char * name() { return "unsigned"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } + static inline const char * name() { return "unsigned"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=4 }; - static inline const char * name() { return "int"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } + static inline const char * name() { return "int"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=int"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=5 }; - static inline const char * name() { return "char"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } + static inline const char * name() { return "char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=char"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=6 }; - static inline const char * name() { return "unsigned char"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } + static inline const char * name() { return "unsigned char"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=7 }; - static inline const char * name() { return "short"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } + static inline const char * name() { return "short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=short"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=8 }; - static inline const char * name() { return "unsigned short"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } + static inline const char * name() { return "unsigned short"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=9 }; - static inline const char * name() { return "long"; } 
- static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } + static inline const char * name() { return "long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=10 }; - static inline const char * name() { return "unsigned long"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } + static inline const char * name() { return "unsigned long"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; } }; -template <> struct _UCL_DATA_ID { +template <> struct _UCL_DATA_ID { enum { id=11 }; - static inline const char * name() { return "long double"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } + static inline const char * name() { return "long double"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; } }; -template struct _UCL_DATA_ID { +template struct _UCL_DATA_ID { enum { id=0 }; - static inline const char * name() { return "error_type"; } - static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } + static inline const char * name() { return "error_type"; } + static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; } }; // Host memory allocation types @@ -97,7 +97,7 @@ enum UCL_MEMOPT { UCL_NOT_SPECIFIED }; -enum UCL_DEVICE_TYPE { +enum UCL_DEVICE_TYPE { UCL_DEFAULT, ///< Unknown device type UCL_CPU, ///< Device is a CPU UCL_GPU, ///< Device is a GPU @@ -111,7 +111,7 @@ enum UCL_ERROR_FLAG { UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found UCL_COMPILE_ERROR, ///< Error compiling kernel UCL_MEMORY_ERROR -}; +}; template const char * ucl_template_name() { return _UCL_DATA_ID::name(); } diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 89f1528969..7fe2604de6 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -34,25 +34,25 @@ class UCL_Vector { ROW_MAJOR = 1, VECTOR = 1 }; - typedef hosttype data_type; + typedef hosttype data_type; /// Host Allocation UCL_H_Vec host; - + /// Device Allocation UCL_D_Vec device; - + UCL_Vector() { } ~UCL_Vector() { } /// Construct with n columns /** \sa alloc() **/ - UCL_Vector(const size_t cols, UCL_Device &acc, + UCL_Vector(const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } - + /// Set up the vector with 'cols' columns and reserve memory /** The kind1 parameter controls memory access from the host * - UCL_READ_WRITE - Specify that you will read and write from host @@ -89,12 +89,12 @@ class UCL_Vector { * \return UCL_SUCCESS if the memory allocation is successful **/ inline int alloc(const size_t cols, UCL_Device &acc, const enum UCL_MEMOPT kind1=UCL_READ_WRITE, - const enum UCL_MEMOPT kind2=UCL_READ_WRITE) + const enum UCL_MEMOPT kind2=UCL_READ_WRITE) { return _ucl_s_obj_help< ucl_same_type::ans >:: alloc(host,device,_buffer,cols,acc,kind1,kind2); } - + /// Free memory and set size to 0 - inline void clear() + inline void clear() { host.clear(); device.clear(); } /// Resize the allocation to contain cols elements @@ -106,7 +106,7 @@ class UCL_Vector { return _ucl_s_obj_help< ucl_same_type::ans >:: dev_resize(device,host,_buffer,cols); } - + /// Resize (only if bigger) the allocation to contain cols elements inline int resize_ib(const int new_cols) { if 
(new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; } @@ -117,14 +117,14 @@ class UCL_Vector { inline void zero(const int n) { zero(n,cq()); } /// Set each element to zero (asynchronously on device) inline void zero(command_queue &cq) { - host.zero(); + host.zero(); if (device.kind()!=UCL_VIEW) device.zero(cq); else if (_buffer.numel()>0) _buffer.zero(); } /// Set first n elements to zero (asynchronously on device) - inline void zero(const int n, command_queue &cq) { - host.zero(n); - if (device.kind()!=UCL_VIEW) device.zero(n,cq); + inline void zero(const int n, command_queue &cq) { + host.zero(n); + if (device.kind()!=UCL_VIEW) device.zero(n,cq); else if (_buffer.numel()>0) _buffer.zero(); } @@ -135,27 +135,27 @@ class UCL_Vector { /// Get the number of columns inline size_t cols() const { return host.cols(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t host_mem_usage() + inline size_t host_mem_usage() { return host.row_bytes()+_buffer.row_bytes(); } /// Get the memory usage (bytes) of the s-object (including any buffers) - inline size_t device_mem_usage() + inline size_t device_mem_usage() { return device.row_bytes(); } - - + + /// Get element at index i inline hosttype & operator[](const int i) { return host[i]; } /// Get element at index i inline const hosttype & operator[](const int i) const { return host[i]; } - /// 2D access (row should always be 0) - inline hosttype & operator()(const int row, const int col) + /// 2D access (row should always be 0) + inline hosttype & operator()(const int row, const int col) { return host[col]; } - /// 2D access (row should always be 0) + /// 2D access (row should always be 0) inline const hosttype & operator()(const int row, const int col) const { return host[col]; } - + /// Returns pointer to memory pointer for allocation on host inline hosttype ** host_ptr() { return host.host_ptr(); } - + /// Return the default command queue/stream associated with this data inline command_queue & cq() { return host.cq(); } /// Change the default command queue associated with this data @@ -172,7 +172,7 @@ class UCL_Vector { /// Update the allocation on the host asynchronously - inline void update_host() + inline void update_host() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(host,device,_buffer,true); } /// Update the allocation on the host (true for asynchronous copy) @@ -202,7 +202,7 @@ class UCL_Vector { /// Update the allocation on the device asynchronously - inline void update_device() + inline void update_device() { _ucl_s_obj_help< ucl_same_type::ans >:: copy(device,host,_buffer,true); } /// Update the allocation on the device (true for asynchronous copy) diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index dd0b5d2424..bd8c7ef843 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), } template -int AnswerT::bytes_per_atom() const { +int AnswerT::bytes_per_atom() const { int bytes=11*sizeof(acctyp); if (_rot) bytes+=4*sizeof(acctyp); @@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) { _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - + _ans_fields=4; if (_rot) _ans_fields+=4; - + // 
--------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); - - _allocated=true; + + _allocated=true; return success; } @@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, if (_charge) _e_fields++; _ev_fields=6+_e_fields; - + // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; - + // Initialize timers for the selected device time_answer.init(*dev); time_answer.zero(); _time_cast=0.0; _time_cpu_idle=0.0; - + return success && alloc(ef_inum); } - + template bool AnswerT::add_fields(const bool charge, const bool rot) { bool realloc=false; @@ -127,15 +127,15 @@ void AnswerT::clear() { template double AnswerT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); } - + template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom) { @@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; - - int csize=_ev_fields; + + int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) @@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, for (int i=0; i<_inum; i++) evdwl+=engv[i]; if (_ef_atom) - if (_ilist==NULL) + if (_ilist==NULL) for (int i=0; i<_inum; i++) eatom[i]+=engv[i]; else @@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom, if (_vf_atom) if (_ilist==NULL) { int ii=0; - for (int i=vstart; i -int AtomT::bytes_per_atom() const { +int AtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor==1) id_space=2; @@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); bool success=true; - + // Ignore host/device transfers? _host_view=false; if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) { @@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) { assert(0==1); #endif } - + // Allocate storage for CUDPP sort #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) { } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); } if (_gpu_nbor==2 && _host_view) @@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) { gpu_bytes+=x.device.row_bytes(); if (gpu_bytes>_max_gpu_bytes) _max_gpu_bytes=gpu_bytes; - - _allocated=true; + + _allocated=true; return success; } @@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, bool success=true; // Ignore host/device transfers? 
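Both Answer::alloc() and Atom::alloc() in the hunks above size their device buffers from the requested count padded by 10% (the static_cast of inum or nall times 1.10), so small growth in the atom count does not trigger a reallocation every timestep. A self-contained sketch of that growth policy (helper names are illustrative, not LAMMPS API):

    // Hypothetical sketch of the 10%-headroom growth policy used by
    // Answer::alloc()/Atom::alloc() above.
    #include <cstddef>

    inline std::size_t padded_capacity(std::size_t n) {
      return static_cast<std::size_t>(static_cast<double>(n) * 1.10);
    }

    // Returns true when the caller must reallocate its buffers.
    inline bool ensure_capacity(std::size_t needed, std::size_t &capacity) {
      if (needed <= capacity) return false;  // still fits, no realloc
      capacity = padded_capacity(needed);    // grow with headroom
      return true;
    }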
int gpu_bytes=0; - + if (charge && _charge==false) { _charge=true; _other=true; @@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot, _gpu_nbor=gpu_nbor; #ifdef USE_CUDPP if (_gpu_nbor==1) { - CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); + CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } @@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot, } else { success=success && (host_particle_id.alloc(_max_atoms,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && + success=success && (host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); - } + } } return success; @@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + // Initialize timers for the selected device time_pos.init(*dev); time_q.init(*dev); @@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_quat.zero(); time_vel.zero(); _time_cast=0.0; - + #ifdef GPU_CAST compile_kernels(*dev); #endif - + return success && alloc(ef_nall); } - + template void AtomT::clear_resize() { if (!_allocated) @@ -274,7 +274,7 @@ void AtomT::clear_resize() { #ifdef USE_CUDPP if (_gpu_nbor==1) cudppDestroyPlan(sort_plan); #endif - + if (_gpu_nbor==2) { host_particle_id.clear(); host_cell_id.clear(); @@ -305,21 +305,21 @@ void AtomT::clear() { template double AtomT::host_memory_usage() const { int atom_bytes=4; - if (_charge) + if (_charge) atom_bytes+=1; - if (_rot) + if (_rot) atom_bytes+=4; - if (_vel) + if (_vel) atom_bytes+=4; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } - + // Sort arrays for neighbor list calculation template void AtomT::sort_neighbor(const int num_atoms) { #ifdef USE_CUDPP - CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), - (int *)dev_particle_id.begin(), + CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), + (int *)dev_particle_id.begin(), 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 2a78719ffb..28ff31c566 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,9 +17,9 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, +__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, const __global double *restrict x, - const __global int *restrict type, + const __global int *restrict type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 23112fe712..1b4e17d972 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -57,19 +57,19 @@ class Atom { /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - + /// Memory usage per atom in this class - int 
bytes_per_atom() const; + int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ - bool init(const int nall, const bool charge, const bool rot, - UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, + bool init(const int nall, const bool charge, const bool rot, + UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, const bool vel=false); - + /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ inline bool resize(const int nall, bool &success) { @@ -81,7 +81,7 @@ class Atom { } return _resized; } - + /// If already initialized by another LAMMPS style, add fields as necessary /** \param rot True if atom storage needs quaternions * \param gpu_nbor 0 if neighboring will be performed on host @@ -89,28 +89,28 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, const bool bonds, const bool vel=false); - + /// Returns true if GPU is using charges bool charge() { return _charge; } - + /// Returns true if GPU is using quaternions bool quaternion() { return _rot; } - + /// Returns true if GPU is using velocities bool velocity() { return _vel; } /// Only free matrices of length inum or nall for resizing void clear_resize(); - + /// Free all memory on host and device void clear(); - + /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; /// Sort arrays for neighbor list calculation on device void sort_neighbor(const int num_atoms); - + /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); @@ -150,18 +150,18 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } - + return total+_time_transfer/1000.0; } - + /// Return the total time for data cast/pack /** Zeros the time so that atom times are only included once **/ - inline double cast_time() + inline double cast_time() { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device template - inline void type_pack1(const int n, const int m_size, + inline void type_pack1(const int n, const int m_size, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one) { int ii=0; @@ -215,7 +215,7 @@ class Atom { view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } - + /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device template inline void type_pack4(const int n, const int m_size, @@ -239,7 +239,7 @@ class Atom { /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device template - inline void self_pack2(const int n, UCL_D_Vec &dev_v, + inline void self_pack2(const int n, UCL_D_Vec &dev_v, UCL_H_Vec &buffer, t1 **one, t2 **two) { for (int i=0; i(one[i][i]); @@ -279,7 +279,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -376,7 +376,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double 
**host_ptr, tagint *host_tag) { + inline void add_v_data(double **host_ptr, tagint *host_tag) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -407,8 +407,8 @@ class Atom { inline void add_transfer_time(double t) { _time_transfer+=t; } /// Return number of bytes used on device - inline double max_gpu_bytes() - { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } /// Returns true if the device is addressing memory on the host inline bool host_view() { return _host_view; } @@ -422,7 +422,7 @@ class Atom { /// Quaterions UCL_Vector quat; /// Velocities - UCL_Vector v; + UCL_Vector v; #ifdef GPU_CAST UCL_Vector x_cast; @@ -436,7 +436,7 @@ class Atom { /// Atom tag information for device nbor builds UCL_D_Vec dev_tag; - + /// Cell list identifiers for hybrid nbor builds UCL_H_Vec host_cell_id; /// Cell list identifiers for hybrid nbor builds @@ -444,7 +444,7 @@ class Atom { /// Device timers UCL_Timer time_pos, time_q, time_quat, time_vel; - + /// Geryon device UCL_Device *dev; @@ -456,19 +456,19 @@ class Atom { #endif bool _compiled; - + // True if data has been copied to device already bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; bool alloc(const int nall); - + bool _allocated, _rot, _charge, _bonds, _vel, _other; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; - + double _max_gpu_bytes; - + #ifdef USE_CUDPP CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/lal_balance.h b/lib/gpu/lal_balance.h index cf09cf86fb..e90e94bee1 100644 --- a/lib/gpu/lal_balance.h +++ b/lib/gpu/lal_balance.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -44,7 +44,7 @@ class Balance { _init_done=false; } } - + /// Return the timestep since initialization inline int timestep() { return _timestep; } @@ -96,7 +96,7 @@ class Balance { inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } } /// Calculate the new host/device split based on the cpu and device times - /** \note Only does calculation every _HD_BALANCE_EVERY timesteps + /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ inline void balance(const double cpu_time); @@ -105,13 +105,13 @@ class Balance { balance(cpu_time); return get_gpu_count(ago,inum_full); } - + private: Device *_device; UCL_Timer _device_time; bool _init_done; int _gpu_nbor; - + bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; @@ -123,15 +123,15 @@ class Balance { #define BalanceT Balance template -void BalanceT::init(Device *gpu, +void BalanceT::init(Device *gpu, const int gpu_nbor, const double split) { clear(); _gpu_nbor=gpu_nbor; _init_done=true; - + _device=gpu; _device_time.init(*gpu->gpu); - + if (split<0.0) { _load_balance=true; _desired_split=0.90; @@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) { _timestep++; return _inum; } - + template void BalanceT::balance(const double cpu_time) { if (_measure_this_step) { diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 191f218bd8..e59dae1a6f 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS 
Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ - + #include "lal_base_atomic.h" using namespace LAMMPS_AL; #define BaseAtomicT BaseAtomic @@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); if (success!=0) return success; - + ucl_device=device->gpu; atom=&device->atom; @@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; - + return ilist; } @@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -217,7 +217,7 @@ template int ** BaseAtomicT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index eaf55f46e2..e3e9829abc 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -41,7 +41,7 @@ class BaseAtomic { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -49,8 +49,8 @@ class BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver @@ -80,7 +80,7 @@ class BaseAtomic { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + 
inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -119,7 +119,7 @@ class BaseAtomic { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success); /// Pair loop with host neighboring @@ -133,19 +133,19 @@ class BaseAtomic { int * compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index e7fe2b62f4..c6341f7d57 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -153,7 +153,7 @@ template inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -226,7 +226,7 @@ template int** BaseChargeT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); 
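The compute() bookkeeping seen throughout these hunks (inum from the balancer, host_start=inum) implements a host/device split: the device handles the first inum atoms and the host picks up the remainder. A self-contained sketch of that partition arithmetic, not the LAMMPS API; the 0.90 figure is the _desired_split default visible in Balance above:

    // Hypothetical sketch: how a host/device split like the one hd_balancer
    // maintains partitions the work; host_start==inum as in compute() above.
    #include <cstdio>

    int gpu_count(int inum_full, double split) {
      return static_cast<int>(split * inum_full); // device share of atoms
    }

    int main() {
      const int inum_full = 1000;
      const double split = 0.90;     // _desired_split default in Balance
      const int inum = gpu_count(inum_full, split);
      std::printf("device: [0,%d)  host: [%d,%d)\n", inum, inum, inum_full);
      return 0;
    }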
host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index e791507432..64c19554b9 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -42,7 +42,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -83,7 +83,7 @@ class BaseCharge { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -137,12 +137,12 @@ class BaseCharge { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 12e3b20d96..478f0092c7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom); @@ -155,7 +155,7 @@ template inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -230,12 +230,12 @@ template int** BaseDipoleT::compute(const int ago, const int 
inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q, double **host_mu, + double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { @@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 2e495c8747..b51c4303cf 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -40,7 +40,7 @@ class BaseDipole { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -82,7 +82,7 @@ class BaseDipole { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -136,12 +136,12 @@ class BaseDipole { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double **mu, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 0efb68a9fb..941f463b14 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _nbor_data=&(nbor->dev_packed); } else _nbor_data=&(nbor->dev_nbor); - + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,false, _threads_per_atom,true); @@ -153,7 +153,7 @@ template inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double 
*subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, + bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); @@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, zero_timers(); return; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); @@ -228,12 +228,12 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, + int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); @@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full, zero_timers(); return NULL; } - + hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); - + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 97640ed40e..7a75282d0a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -40,7 +40,7 @@ class BaseDPD { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -81,7 +81,7 @@ class BaseDPD { * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ - inline void resize_local(const int inum, const int host_inum, + inline void resize_local(const int inum, const int host_inum, const int max_nbors, bool &success) { nbor->resize(inum,host_inum,max_nbors,success); } @@ -129,20 +129,20 @@ class BaseDPD { int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **v, const double dtinvsqrt, const int seed, + double **v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, 
bool &success, double **v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage Device *device; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 4200c02e1c..8918a3140c 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); @@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, return -8; if (_multiple_forms && gpu_nbor!=0) return -9; - + if (_multiple_forms) ans->force.zero(); @@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); - + if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); @@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() { delete lj_program; _compiled=false; } - + time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); @@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() { if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); @@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() { } // --------------------------------------------------------------------------- -// Pack neighbors to limit thread divergence for lj-lj and ellipse +// Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template -void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, +void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { @@ -264,18 +264,18 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, // Copy neighbor list from host // --------------------------------------------------------------------------- template -void BaseEllipsoidT::reset_nbors(const int nall, const int inum, +void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; - + int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; - + if (_multiple_forms) { int p=0; for (int i=0; i inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success) { success=true; @@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, zero_timers(); return NULL; } - + int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); 
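Throughout these compute() entry points, hd_balancer implements the host/device work split that gpu_split configures: of the inum_full local particles, the first inum are assigned to the accelerator, and every path sets host_start=inum so the caller knows where the host's share begins. When the split is dynamic, the fraction is retuned from the measured host and device times. The following standalone C++ sketch shows the feedback idea only; the Balancer class, its damping constant, and its timing arguments are illustrative assumptions, not the library's actual hd_balancer interface.

#include <algorithm>
#include <cmath>

// Hypothetical stand-in for a dynamic host/device load balancer.
// On each retune it nudges the fraction of particles sent to the
// device so that measured host and device times converge.
class Balancer {
 public:
  explicit Balancer(double split = 0.5) : _split(split) {}

  // Decide how many of inum_full particles the device handles.
  int balance(int inum_full, double cpu_time, double gpu_time) {
    if (cpu_time > 0.0 && gpu_time > 0.0) {
      // Estimate the split that equalizes per-particle host and
      // device rates; damp the update to avoid oscillation.
      double target = _split * cpu_time /
                      (_split * cpu_time + (1.0 - _split) * gpu_time);
      _split = 0.75 * _split + 0.25 * target;
    }
    _split = std::min(1.0, std::max(0.05, _split));
    return static_cast<int>(std::ceil(_split * inum_full));
  }

 private:
  double _split;  // fraction of particles computed on the device
};

// Usage mirroring the pattern in the patch:
//   int inum = balancer.balance(inum_full, cpu_time, gpu_time);
//   host_start = inum;  // particles [inum, inum_full) stay on the host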
ans->inum(inum); @@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { @@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; - + // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, @@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); - } else { + } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); @@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const { } template -void BaseEllipsoidT::compile_kernels(UCL_Device &dev, +void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, - const void *lj_string, + const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index e289430f43..7deeccbf44 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -42,7 +42,7 @@ class BaseEllipsoid { * \param gpu_split fraction of particles handled by device * \param ellipsoid_sphere true if ellipsoid-sphere case handled separately * \param k_name name for the kernel for force calculation - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -68,7 +68,7 @@ class BaseEllipsoid { quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); - } + } } /// Check if there is enough storage for neighbors and realloc if not @@ -78,7 +78,7 @@ class BaseEllipsoid { * \param olist_size size of list of particles from CPU neighboring * \note host_inum is 0 if the host is performing neighboring * \note if GPU is neighboring nlocal+host_inum=total number local particles - * \note if CPU is neighboring olist_size=total number of local particles + * \note if CPU is neighboring olist_size=total number of local particles * \note if GPU is neighboring olist_size=0 **/ inline void resize_local(const int nlocal, const int host_inum, const int max_nbors, const int olist_size, @@ -101,7 +101,7 @@ class BaseEllipsoid { /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear_base(); - + /// Output any timing information void output_times(); @@ -130,7 +130,7 @@ class BaseEllipsoid { ans->acc_timers(); } } - + /// Zero timers inline void zero_timers() { time_nbor1.zero(); @@ -148,9 +148,9 @@ class BaseEllipsoid { ans->zero_timers(); } - /// Pack neighbors to limit thread divergence for lj-lj and ellipse + /// Pack neighbors to limit thread divergence for lj-lj and ellipse void pack_nbors(const int GX, const int BX, const int start, const int inum, - const int form_low, const int form_high, + const int form_low, const int form_high, const bool shared_types, int ntypes); /// Copy neighbor list from host @@ -174,17 +174,17 @@ class BaseEllipsoid { int** compute(const int ago, const int inum_full, const int nall, double 
**host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time,
                 bool &success, double **host_quat);
 
   /// Build neighbor list on accelerator
-  void build_nbor_list(const int inum, const int host_inum, const int nall, 
+  void build_nbor_list(const int inum, const int host_inum, const int nall,
                        double **host_x, int *host_type, double *sublo,
                        double *subhi, bool &success);
- 
-  // -------------------------- DEVICE DATA ------------------------- 
+
+  // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
   Device<numtyp,acctyp> *device;
 
@@ -207,7 +207,7 @@ class BaseEllipsoid {
   /// Atom Data
   Atom<numtyp,acctyp> *atom;
 
-  // --------------------------- TYPE DATA -------------------------- 
+  // --------------------------- TYPE DATA --------------------------
 
   /// cut_form.x = cutsq, cut_form.y = form
   UCL_D_Vec<numtyp2> cut_form;
@@ -240,7 +240,7 @@ class BaseEllipsoid {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  // True if we want to use fast GB-sphere or sphere-sphere calculations 
+  // True if we want to use fast GB-sphere or sphere-sphere calculations
   bool _multiple_forms;
   int **_host_form;
   int _last_ellipse, _max_last_ellipse;
diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index c41aad7b58..14f642e55b 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -12,7 +12,7 @@
     begin : Tue April 2, 2013
     email : brownw@ornl.gov
  ***************************************************************************/
- 
+
 #include "lal_base_three.h"
 
 using namespace LAMMPS_AL;
 #define BaseThreeT BaseThree<numtyp, acctyp>
@@ -45,7 +45,7 @@ int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
 #ifdef THREE_CONCURRENT
   b+=ans2->bytes_per_atom();
 #endif
-  return b; 
+  return b;
 }
 
 template <class numtyp, class acctyp>
@@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     gpu_nbor=1;
   else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
     gpu_nbor=2;
+  _gpu_nbor=gpu_nbor;
 
   int _gpu_host=0;
   int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@@ -76,7 +77,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     _nbor_data=&(nbor->dev_nbor);
   if (_threads_per_atom*_threads_per_atom>device->warp_size())
     return -10;
- 
+
   int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
                            maxspecial,_gpu_host,max_nbors,cell_size,false,
                            _threads_per_atom);
@@ -93,7 +94,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     return -3;
   ans2->cq(_end_command_queue);
 #endif
- 
+
   _block_pair=device->pair_block_size();
   _block_size=device->block_ellipse();
   compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
@@ -111,7 +112,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
 #ifdef THREE_CONCURRENT
   _max_an_bytes+=ans2->gpu_bytes();
 #endif
- 
+
   return 0;
 }
 
@@ -158,7 +159,7 @@ void BaseThreeT::clear_atomic() {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
-                              int *ilist, int *numj, int **firstneigh, 
+                              int *ilist, int *numj, int **firstneigh,
                               bool &success) {
   success=true;
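The -10 return path above rejects threads_per_atom settings whose square exceeds the device warp size: in the three-body kernels, t_per_atom threads cooperate on one atom and iterate over pairs of its neighbors, so t_per_atom*t_per_atom lanes need to fit in a warp for the reductions to stay safe (that rationale is an inference from the check, not stated in the source). The same value also sets the kernel launch grid in the loop() methods elsewhere in this patch, via GX = ceil(inum/(BX/t_per_atom)). A minimal sketch with invented helper names:

#include <cmath>

// Grid sizing used by the loop() methods: each block of block_size
// threads covers block_size/t_per_atom atoms, since t_per_atom
// threads cooperate on a single atom's neighbor list.
inline int grid_size(int inum, int block_size, int t_per_atom) {
  const int atoms_per_block = block_size / t_per_atom;
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    atoms_per_block));
}

// Mirror of the -10 validity check in init_three() above.
inline bool valid_three_body_tpa(int t_per_atom, int warp_size) {
  return t_per_atom * t_per_atom <= warp_size;
}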
@@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
   if (!success)
     return NULL;
 
-  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+  // originally, the requirement that nall == nlist was enforced
+  // to allow direct indexing of neighbors of neighbors after re-arrangement
+//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+
+  // the requirement is now removed, allowing this to work within pair hybrid
+  nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
 
   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 #ifdef THREE_CONCURRENT
@@ -176,7 +182,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
 #endif
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
- 
+
   return ilist;
 }
 
@@ -185,11 +191,11 @@
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
-                                       const int nall, double **host_x, 
-                                       int *host_type, double *sublo, 
-                                       double *subhi, tagint *tag, 
-                                       int **nspecial, tagint **special, 
-                                       bool &success) {
+                                       const int nall, double **host_x,
+                                       int *host_type, double *sublo,
+                                       double *subhi, tagint *tag,
+                                       int **nspecial, tagint **special,
+                                       bool &success) {
   success=true;
   resize_atom(inum,nall,success);
   resize_local(nall,host_inum,nbor->max_nbors(),success);
@@ -214,11 +220,11 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
+void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
                          const int nlist, double **host_x, int *host_type,
-                         int *ilist, int *numj, int **firstneigh, 
+                         int *ilist, int *numj, int **firstneigh,
                          const bool eflag, const bool vflag, const bool eatom,
-                         const bool vatom, int &host_start, 
+                         const bool vatom, int &host_start,
                          const double cpu_time, bool &success) {
   acc_timers();
   if (nlist==0) {
@@ -228,9 +234,9 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
     zero_timers();
     return;
   }
- 
+
   int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(ago,nlocal,cpu_time);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
   ans->inum(inum);
 #ifdef THREE_CONCURRENT
   ans2->inum(inum);
 #endif
@@ -270,7 +276,7 @@ template <class numtyp, class acctyp>
 int ** BaseThreeT::compute(const int ago, const int inum_full,
                            const int nall, double **host_x, int *host_type,
                            double *sublo, double *subhi, tagint *tag,
-                           int **nspecial, tagint **special, const bool eflag, 
+                           int **nspecial, tagint **special, const bool eflag,
                            const bool vflag, const bool eatom,
                            const bool vatom, int &host_start,
                            int **ilist, int **jnum,
@@ -283,7 +289,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
     zero_timers();
     return NULL;
   }
- 
+
   hd_balancer.balance(cpu_time);
   int inum=hd_balancer.get_gpu_count(ago,inum_full);
   ans->inum(inum);
@@ -291,7 +297,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   ans2->inum(inum);
 #endif
   host_start=inum;
- 
+
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@@ -321,7 +327,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   device->add_ans_object(ans2);
 #endif
   hd_balancer.stop_timer();
- 
+
   return nbor->host_jlist.begin()-host_start;
 }
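The functional change in this patch is the reset_nbors() body above: get_host3() required nlist == nall so that a neighbor's own neighbors could be reached by direct index after re-arrangement, while the generic get_host() repacks only the atoms actually present in this sub-style's list. That is what lets a three-body style run under pair_style hybrid, where each sub-style sees just a subset of atoms; the cost is that neighbors of neighbors must be located through the packed list rather than by direct nall-based indexing. A rough C++ sketch of such host-side repacking, with invented names in place of the real Neighbor internals:

#include <vector>

// Gather a host neighbor list (LAMMPS-style ilist/numj/firstneigh)
// into flat arrays suitable for a single upload to the device.
struct PackedNbors {
  std::vector<int> jlist;   // concatenated neighbor indices
  std::vector<int> offset;  // start of atom ii's neighbors in jlist
  std::vector<int> numj;    // neighbor count per listed atom
};

inline PackedNbors pack_host_nbors(int nlist, const int *ilist,
                                   const int *numj, int **firstneigh) {
  PackedNbors p;
  p.offset.reserve(nlist);
  p.numj.reserve(nlist);
  for (int ii = 0; ii < nlist; ++ii) {
    const int i = ilist[ii];            // only atoms in this sub-list
    p.offset.push_back(static_cast<int>(p.jlist.size()));
    p.numj.push_back(numj[i]);
    p.jlist.insert(p.jlist.end(), firstneigh[i], firstneigh[i] + numj[i]);
  }
  return p;
}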
@@ -352,7 +358,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
   k_three_end.cq(ucl_device->cq(_end_command_queue));
   k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
 #endif
- 
+
   _compiled=true;
 }
 
diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h
index 0af290469a..4f27ecdf92 100644
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@@ -44,7 +44,7 @@ class BaseThree {
    * \param gpu_split fraction of particles handled by device
    * \param k_two name for the kernel for 2-body force calculation
    * \param k_three name for the kernel for 3-body force calculation
-   * 
+   *
    * Returns:
    * - 0 if successfull
    * - -1 if fix gpu not found
@@ -53,8 +53,8 @@ class BaseThree {
    * - -5 Double precision is not supported on card
    * - -10 if invalid thread_per_atom setting **/
   int init_three(const int nlocal, const int nall, const int max_nbors,
-                 const int maxspecial, const double cell_size, 
-                 const double gpu_split, FILE *screen, 
+                 const int maxspecial, const double cell_size,
+                 const double gpu_split, FILE *screen,
                  const void *pair_program, const char *k_two,
                  const char *k_three_center, const char *k_three_end);
 
@@ -88,7 +88,7 @@ class BaseThree {
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
-  inline void resize_local(const int inum, const int host_inum, 
+  inline void resize_local(const int inum, const int host_inum,
                            const int max_nbors, bool &success) {
     nbor->resize(inum,host_inum,max_nbors,success);
   }
@@ -133,33 +133,33 @@ class BaseThree {
   /// Build neighbor list on device
   int build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
-                      double *sublo, double *subhi, tagint *tag, int **nspecial, 
+                      double *sublo, double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);
 
   /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall, 
+  void compute(const int f_ago, const int inum_full, const int nall,
                const int nlist, double **host_x, int *host_type,
                int *ilist, int *numj, int **firstneigh, const bool eflag,
                const bool vflag, const bool eatom, const bool vatom,
                int &host_start, const double cpu_time, bool &success);
 
   /// Pair loop with device neighboring
-  int * compute(const int ago, const int inum_full, const int nall, 
+  int * compute(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, double *sublo,
                 double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag, 
-                const bool eatom, const bool vatom, int &host_start, 
+                tagint **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
                 const double cpu_time, bool &success);
 
   /// Pair loop with device neighboring
   int ** compute(const int ago, const int inum_full, const int nall,
                  double **host_x, int *host_type, double *sublo,
                  double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag, 
-                 const bool eatom, const bool vatom, int &host_start, 
+                 tagint **special, const bool eflag, const bool vflag,
+                 const bool eatom, const bool vatom, int &host_start,
                  int **ilist, int **numj, const double cpu_time,
                  bool &success);
 
-  // -------------------------- DEVICE DATA ------------------------- 
+  // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
   Device<numtyp,acctyp> *device;
@@ -186,7 +186,7 @@ class BaseThree {
   Answer<numtyp,acctyp> *ans;
 #ifdef THREE_CONCURRENT
   Answer<numtyp,acctyp> *ans2;
-  #endif 
+  #endif
 
   //
--------------------------- NBOR DATA ---------------------------- @@ -205,15 +205,16 @@ class BaseThree { protected: bool _compiled; int _block_pair, _block_size, _threads_per_atom, _end_command_queue; + int _gpu_nbor; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k_two, const char *k_three_center, const char *k_three_end); - virtual void loop(const bool _eflag, const bool _vflag, + virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; }; diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp index 062c095957..165a02b71a 100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic(), _allocated(false) { } template -BeckT::~Beck() { +BeckT::~Beck() { clear(); } - + template int BeckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BeckT::init(const int ntypes, +int BeckT::init(const int ntypes, double **host_cutsq, double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, @@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 7ccefd8859..7d72128b5f 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,7 +24,7 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_beck(const __global numtyp4 *restrict x_, +__kernel void k_beck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict beck1, const __global numtyp4 *restrict beck2, const int lj_types, @@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, beck1[tid]=beck1_in[tid]; beck2[tid]=beck2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index fa56db2402..db26bebeb0 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library 
(LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Beck : public BaseAtomic { public: Beck(); - ~Beck(); + ~Beck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Beck : public BaseAtomic { double **host_aa, double **host_alpha, double **host_beta, double **host_AA, double **host_BB, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Beck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 28ca0df346..226c2d477b 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full, return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 55cb24d3b0..7c1ed944d3 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic(), _allocated(false) { } template -BornT::~Born() { +BornT::~Born() { clear(); } - + template int BornT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const { template int BornT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + 
const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); + host_d,host_offset); cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, @@ -102,18 +102,18 @@ void BornT::reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_born1,host_born2,host_born3); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); + host_d,host_offset); } template @@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, - &cutsq_sigma, &_lj_types, &sp_lj, + &cutsq_sigma, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 5f917be846..0ca7fea5fe 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,16 +24,16 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_born(const __global numtyp4 *restrict x_, +__kernel void k_born(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp2 *restrict cutsq_sigma, - const int lj_types, - const __global numtyp *restrict sp_lj_in, + const int lj_types, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } // if ii } 
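Every pair style in this library ships the plain kernel above plus a *_fast variant (k_born_fast follows) that is selected when the number of atom types fits in MAX_SHARED_TYPES: the block cooperatively stages the whole type-pair coefficient table in __local/shared memory, synchronizes, and then serves every coefficient lookup in the neighbor loop from on-chip storage; the energy-only table (coeff2 here) is staged only when eflag is set. A trimmed CUDA sketch of that staging pattern, with illustrative names in place of the Geryon portability macros:

#define MAX_SHARED_TYPES 8

__global__ void k_pair_fast_sketch(const float4 *coeff_in, int inum) {
  // Stage the per-type-pair coefficient table once per block.
  __shared__ float4 coeff[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
  const int tid = threadIdx.x;
  if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
    coeff[tid] = coeff_in[tid];
  __syncthreads();  // table complete before any thread reads it

  const int ii = blockIdx.x * blockDim.x + tid;
  if (ii < inum) {
    // A real kernel would loop over neighbors here, reading
    // coeff[itype * MAX_SHARED_TYPES + jtype] from shared memory
    // instead of re-fetching global memory for every pair.
  }
}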
-__kernel void k_born_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp2 *restrict cutsq_sigma, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); + energy+=factor_lj*(e-coeff2[mtype].w); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 6fed6461d2..685f4d87a9 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Born : public BaseAtomic { public: Born(); - ~Born(); + ~Born(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,20 +38,20 @@ class Born : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_born1, double **host_born2, + double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, - double **host_d, double **host_sigma, + double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -77,7 
+77,7 @@ class Born : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 94becf8c69..68695c4938 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulLongT::~BornCoulLongT() { clear(); } - + template int BornCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,12 +84,12 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); - + host_d,host_offset); + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -142,7 +142,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, + &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 3d74f2087a..4cb4ea448f 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ 
texture q_tex; #define q_tex q_ #endif -__kernel void k_born_long(const __global numtyp4 *restrict x_, +__kernel void k_born_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv - + coeff2[mtype].z*r2inv*r6inv; - energy+=factor_lj*(e-coeff2[mtype].w); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 
*restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index 382e9a2b2c..feb7472e74 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,9 +30,9 @@ static BornCoulLong BORNCLMF; int 
borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); @@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,7 +102,7 @@ void borncl_gpu_clear() { int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -112,8 +112,8 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 7615c1dd53..7ebd7b744f 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,17 +37,17 @@ template BornCoulWolfT::~BornCoulWolfT() { clear(); } - + template int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { 
return this->bytes_per_atom_atomic(max_nbors); } template -int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **host_sigma, double **host_offset, +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -84,12 +84,12 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_d,host_offset); - + host_d,host_offset); + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, host_cut_ljsq,host_sigma); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq_sigma, &_cut_coulsq, &_qqrd2e, - &_alf, &_e_shift, &_f_shift, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq_sigma, &_cut_coulsq, - &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index e7706b408a..0dc7d08c63 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -31,21 +31,21 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_born_wolf(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + 
__global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp4 *restrict cutsq_sigma, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; @@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, +__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, + const __global numtyp4 *restrict coeff2_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp alf, const numtyp e_shift, + const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, numtyp forcecoul, forceborn, force, r6inv, prefactor; numtyp v_sh = (numtyp)0.0; numtyp rexp = (numtyp)0.0; - + if (rsq < 
cutsq_sigma[mtype].y) { numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); r6inv = r2inv*r2inv*r2inv; - forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index 9e02d23233..4b2406b989 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, double **host_sigma, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, @@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, /// coeff1.w = born3 UCL_D_Vec coeff1; /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset UCL_D_Vec coeff2; - /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, /// cutsq_sigma.z = sigma UCL_D_Vec cutsq_sigma; /// Special LJ values [0-3] and Special Coul values [4-7] @@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift; diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index b56c526119..254b1c905b 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BornCoulWolf BORNCWMF; // Allocate memory on host and device and copy constants to device // 
--------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, + double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); @@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, - host_cut_coulsq, host_special_coul, qqrd2e, + host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void borncw_gpu_clear() { int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,8 +114,8 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 6bd51e6d68..b1ebf5804c 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Born BORNMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, double 
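The staggered initialization in borncw_gpu_init is worth a gloss: the first world rank initializes the style alone (compiling kernels, allocating tables), then the remaining ranks that share each accelerator take turns between barriers. A minimal MPI sketch of the same turn-taking idea, with printf standing in for BORNCWMF.init and the library's world/gpu barriers replaced by plain MPI_Barrier:

    #include <mpi.h>
    #include <cstdio>

    // Turn-taking device initialization: rank 0 goes first, then the other
    // ranks sharing the card go one at a time between barriers.
    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);
      if (rank == 0)
        std::printf("rank 0: init\n");      // the world_me==0 branch above
      MPI_Barrier(MPI_COMM_WORLD);          // world_barrier()
      for (int i = 1; i < size; i++) {
        if (rank == i)
          std::printf("rank %d: init\n", rank);
        MPI_Barrier(MPI_COMM_WORLD);        // gpu_barrier()
      }
      MPI_Finalize();
      return 0;
    }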
*special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, - host_born3, host_a, host_c, host_d, sigma, + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv, int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); - + if (world_me==0) BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, offset); - + BORNMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } void born_gpu_clear() { - BORNMF.clear(); + BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, @@ -132,8 +132,8 @@ int ** born_gpu_compute_n(const int ago, const int inum_full, return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index f66759ee3a..0da4068d51 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic(), _allocated(false) { } template -BuckT::~Buck() { +BuckT::~Buck() { clear(); } - + template int BuckT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const { template int BuckT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double 
**host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -95,14 +95,14 @@ template void BuckT::reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv, host_buck1,host_buck2,host_cutsq); this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c, @@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 955547e598..c1e1c7d7e2 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_buck(const __global numtyp4 *restrict x_, +__kernel void k_buck(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, + const __global numtyp4 *restrict coeff2, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_buck_fast(const __global numtyp4 *restrict x_, +__kernel void k_buck_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 
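The grid sizing visible in BuckT::loop above follows one rule used throughout the library: a block of BX threads covers BX/t_per_atom atoms, so the grid needs ceil(inum/(BX/t_per_atom)) blocks. As a self-contained sketch (assuming t_per_atom evenly divides the block size, as the power-of-two thread counts here do):

    #include <cmath>

    // Blocks needed when t_per_atom threads cooperate on each atom.
    int grid_blocks(int inum, int block_size, int t_per_atom) {
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        (block_size / t_per_atom)));
    }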
*restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, if (eflag>0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); + energy+=factor_lj*(e-coeff2[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index ebcd72d990..3b84066355 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Buck : public BaseAtomic { public: Buck(); - ~Buck(); + ~Buck(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,18 +38,18 @@ class Buck : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -72,7 +72,7 @@ class Buck : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index bec640e7a6..e4f829fc5c 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -9,7 +9,7 @@ This 
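k_buck_fast illustrates the shared-memory variant used whenever the type count fits in MAX_SHARED_TYPES: each block stages the coefficient tables into local/shared memory once, and coeff2 (needed only for energies) is skipped when eflag is zero. A CUDA sketch of just that staging step, with illustrative names and a locally defined MAX_SHARED_TYPES:

    #define MAX_SHARED_TYPES 8

    struct numtyp4 { float x, y, z, w; };

    __global__ void stage_coeffs(const numtyp4 *coeff1_in,
                                 const numtyp4 *coeff2_in, int eflag) {
      __shared__ numtyp4 coeff1[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      __shared__ numtyp4 coeff2[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
      int tid = threadIdx.x;
      if (tid < MAX_SHARED_TYPES * MAX_SHARED_TYPES) {
        coeff1[tid] = coeff1_in[tid];
        if (eflag > 0) coeff2[tid] = coeff2_in[tid];  // energy-only table
      }
      __syncthreads();  // tables now visible to the whole block
      // ... force loop would read coeff1[itype*MAX_SHARED_TYPES + jtype] ...
    }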
file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge(), _allocated(false) { } template -BuckCoulT::~BuckCoul() { +BuckCoulT::~BuckCoul() { clear(); } - + template int BuckCoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const { template int BuckCoulT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { @@ -81,21 +81,21 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); - + host_offset); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq, host_cut_ljsq, host_cut_coulsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; host_write[i+4]=host_special_coul[i]; } ucl_copy(sp_lj,host_write,8,false); - + _qqrd2e = qqrd2e; - + _allocated=true; this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes(); return 0; @@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, + &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 87604a02ea..6f0d414825 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul(const __global numtyp4 *restrict x_, +__kernel void 
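Note how BuckCoulT::init ships the special-bond factors: the four LJ and four Coulomb scale factors travel to the device as one 8-entry array, which the kernels unpack as sp_lj[0..3] and sp_lj[4..7]. A host-side sketch of that packing:

    #include <array>

    // Pack special_lj and special_coul into the single 8-slot array the
    // kernels index as sp_lj[0..3] (LJ) and sp_lj[4..7] (Coulomb).
    std::array<double, 8> pack_special(const double *special_lj,
                                       const double *special_coul) {
      std::array<double, 8> sp{};
      for (int i = 0; i < 4; i++) {
        sp[i] = special_lj[i];
        sp[i + 4] = special_coul[i];
      }
      return sp;
    }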
k_buck_coul(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp4 *restrict cutsq, + const __global numtyp4 *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) coeff2[tid]=coeff2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index e4bf59107c..3f8428bfe1 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class BuckCoul : public BaseCharge { public: BuckCoul(); - ~BuckCoul(); + ~BuckCoul(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); @@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + numtyp _qqrd2e; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index dd696fc6bb..e5a5e1315b 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
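The cutsq table packed in BuckCoulT::init carries three squared radii per type pair ({cutsq, cut_ljsq, cut_coulsq}), and k_buck_coul gates each contribution on its own cutoff. A minimal host-side sketch of that gating, with illustrative names (the kernel of course computes the Buckingham and Coulomb terms inline):

    struct PairCut { double both, lj, coul; };  // squared cutoffs

    // Accumulate only the contributions whose cutoff contains rsq.
    double gated_force(const PairCut &c, double rsq,
                       double flj, double fcoul) {
      if (rsq >= c.both) return 0.0;   // pair entirely out of range
      double f = 0.0;
      if (rsq < c.lj)   f += flj;      // Buckingham part
      if (rsq < c.coul) f += fcoul;    // Coulomb part
      return f;
    }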
__________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static BuckCoul BUCKCMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, @@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } void buckc_gpu_clear() { - BUCKCMF.clear(); + BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -111,8 +111,8 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void buckc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 4aa720132a..81faada116 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template BuckCoulLongT::~BuckCoulLongT() { clear(); } - + template int BuckCoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const 
{ template int BuckCoulLongT::init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **host_offset, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -83,11 +83,11 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq, coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, - host_offset); - + host_offset); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); - + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { host_write[i]=host_special_lj[i]; @@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, - &cutsq, &_cut_coulsq, &_qqrd2e, + &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index fc68d12471..da3237a31f 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,19 +29,19 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, - const __global numtyp4 *restrict coeff2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const 
numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - if (rsq < cut_coulsq) - e_coul += prefactor*(_erfc-factor_coul); - if (rsq < coeff1[mtype].w) { - numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; - energy+=factor_lj*(e-coeff2[mtype].z); - } - } - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } - } - - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); - } // if ii -} - -__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict coeff1_in, - const __global numtyp4 *restrict coeff2_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, - const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const int t_per_atom) { - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); - - __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[8]; - if (tid<8) - sp_lj[tid]=sp_lj_in[tid]; - if (tid0) - coeff2[tid]=coeff2_in[tid]; - } - - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); - - if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; + energy+=factor_lj*(e-coeff2[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp cut_coulsq, + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; 
i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_rhoinv, double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_rhoinv, double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index 9c0c331ee1..28a89746b3 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,7 +28,7 @@ static BuckCoulLong BUCKCLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, + double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -100,7 +100,7 @@ void buckcl_gpu_clear() { int** 
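The k_buck_coul_long kernels reindented above evaluate the real-space Ewald Coulomb term inside cut_coulsq; the _erfc quantity is computed device-side from the usual polynomial fit. A host-side sketch of the same force factor using std::erfc instead, with EWALD_F = 2/sqrt(pi) as defined in the library headers (special-bond scaling via factor_coul omitted for brevity):

    #include <cmath>

    // The quantity the kernels call forcecoul, i.e. -dE/dr times r; it is
    // multiplied by r2inv before scaling the displacement vector.
    double coul_long_force(double qqrd2e, double qi, double qj,
                           double r, double g_ewald) {
      const double EWALD_F = 1.12837917;   // 2/sqrt(pi)
      double grij = g_ewald * r;
      double expm2 = std::exp(-grij * grij);
      double prefactor = qqrd2e * qi * qj / r;
      return prefactor * (std::erfc(grij) + EWALD_F * grij * expm2);
    }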
buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -110,8 +110,8 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 75c88e8dbe..336aab6d4c 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,8 +28,8 @@ static Buck BUCKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, + init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, int world_me=BUCKMF.device->world_me(); int gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); - + if (world_me==0) BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset); - + BUCKMF.device->world_barrier(); for (int i=0; igpu_barrier(); } } void buck_gpu_clear() { - BUCKMF.clear(); + BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, @@ -128,8 +128,8 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full, return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void buck_gpu_compute(const int ago, const int inum_full, const int nall, double 
**host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_cg_cmm.cpp index 96455888f0..d361e32b09 100644 --- a/lib/gpu/lal_cg_cmm.cpp +++ b/lib/gpu/lal_cg_cmm.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic(), _allocated(false) { } template -CGCMMT::~CGCMM() { +CGCMMT::~CGCMM() { clear(); } - + template int CGCMMT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -75,12 +75,12 @@ int CGCMMT::init(const int ntypes, double **host_cutsq, host_write[i]=0.0; lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); - this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, + this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq, host_cg_type,host_lj1,host_lj2); lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu index 8f89f74d22..70d2ab6092 100644 --- a/lib/gpu/lal_cg_cmm.cu +++ b/lib/gpu/lal_cg_cmm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, 
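One detail of the CGCMMT::init packing below: host_cg_type is an integer table, stored in the .y slot of the same numtyp4 as the floating-point coefficients. That is safe because the small form indices are exactly representable in single precision, so device code can recover them with a cast. Illustrative sketch:

    struct numtyp4f { float x, y, z, w; };

    // Recover the integer form flag packed alongside float coefficients.
    int packed_form(const numtyp4f &lj1) {
      return static_cast<int>(lj1.y);   // exact for small integer values
    }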
- __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii class CGCMM : public BaseAtomic { public: CGCMM(); - ~CGCMM(); + ~CGCMM(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic { int init(const int ntypes, double **host_cutsq, int **host_cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _cmm_types; private: diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 0d2c3d8fbf..b6fc110b15 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMM CMMMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); @@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) - init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,8 +103,8 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full, return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void cmm_gpu_compute(const int ago, const int inum_full, const int nall, 
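The CG-CMM forms differ only in their exponent pair (9-6, 12-4, or 12-6 in the SDK coarse-grain model), so the kernels build the needed inverse powers from r2inv products rather than calling pow(). A plausible host-side arrangement, with illustrative names (the actual kernels fold the numeric prefactors into the packed lj1/lj3 tables):

    #include <cmath>

    enum CGForm { CG_LJ9_6, CG_LJ12_4, CG_LJ12_6 };   // illustrative names

    // Build r^-m (inv1) and r^-n (inv2) for the form's exponent pair n-m.
    void cg_inv_powers(CGForm form, double r2inv,
                       double &inv1, double &inv2) {
      if (form == CG_LJ9_6) {
        inv1 = r2inv * r2inv * r2inv;      // r^-6
        inv2 = inv1 * std::sqrt(inv1);     // r^-9
      } else if (form == CG_LJ12_4) {
        inv1 = r2inv * r2inv;              // r^-4
        inv2 = inv1 * inv1 * inv1;         // r^-12
      } else {                             // 12-6
        inv1 = r2inv * r2inv * r2inv;      // r^-6
        inv2 = inv1 * inv1;                // r^-12
      }
    }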
double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_cg_cmm_long.cpp index 92e6bd04b5..14b5b7622c 100644 --- a/lib/gpu/lal_cg_cmm_long.cpp +++ b/lib/gpu/lal_cg_cmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,22 +37,22 @@ template CGCMMLongT::~CGCMMLong() { clear(); } - + template int CGCMMLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CGCMMLongT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int CGCMMLongT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, - double **host_cut_ljsq, + double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { @@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, &this->atom->q, - &_cut_coulsq, &_qqrd2e, &_g_ewald, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu index ae8b6cda47..f6942d1809 100644 --- a/lib/gpu/lal_cg_cmm_long.cu +++ b/lib/gpu/lal_cg_cmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -29,12 +29,12 @@ texture q_tex; #define q_tex q_ #endif -__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, + const __global numtyp4 *restrict lj3, + const int lj_types, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, @@ -70,7 +70,7 @@ 
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].y) { energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)- lj3[mtype].w; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, - const __global numtyp *restrict q_, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, lj1[tid]=lj1_in[tid]; lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_cg_cmm_long.h index bde5c79c74..aa0cbfbaf0 100644 --- a/lib/gpu/lal_cg_cmm_long.h +++ b/lib/gpu/lal_cg_cmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge { int init(const int ntypes, double **host_cutsq, int ** cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald); @@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge { // --------------------------- TYPE DATA -------------------------- - /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, + /// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2, UCL_D_Vec lj1; /// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, 
lj3.w = offset UCL_D_Vec lj3; @@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _cut_coulsq, _qqrd2e, _g_ewald; diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index 966588bf9b..ee0a0269e5 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static CGCMMLong CMMLMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, @@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -99,7 +99,7 @@ void cmml_gpu_clear() { int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -109,8 +109,8 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); -} - +} + void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 157072dc22..9cd032b3c6 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template CHARMMLongT::~CHARMMLong() { clear(); } - + template int CHARMMLongT::bytes_per_atom(const int max_nbors) const { return 
this->bytes_per_atom_atomic(max_nbors); @@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { template int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index dde50da300..244131f833 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -31,14 +31,14 @@ texture q_tex; __kernel void k_charmm_long(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_lj, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, @@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp qtmp; fetch(qtmp,i,q_tex); int itype=ix.w; @@ -93,7 +93,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y); if (rsq > cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= 
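The switching math that recurs in both k_charmm_long kernels deserves a gloss: between cut_lj_inner and cut_lj the LJ energy is scaled by a smooth factor switch1 and the force picks up the derivative term switch2, with denom_lj = (cut_ljsq - cut_lj_innersq)^3 precomputed on the host. A host-side sketch of those lines:

    // CHARMM switching region, all in squared distances as in the kernels.
    // Applied as: force_lj = force_lj*switch1 + e_lj*switch2; e_lj *= switch1.
    void charmm_switch(double rsq, double cut_ljsq, double cut_lj_innersq,
                       double denom_lj, double &switch1, double &switch2) {
      double d = cut_ljsq - rsq;
      switch2 = 12.0 * rsq * d * (rsq - cut_lj_innersq) / denom_lj;
      switch1 = d * d * (cut_ljsq + 2.0 * rsq - 3.0 * cut_lj_innersq)
                / denom_lj;
    }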
(cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ @@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, if (rsq > cut_lj_innersq) e *= switch1; energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*delx*force; @@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, const __global numtyp2 *restrict ljd_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_bothsq, const numtyp cut_ljsq, const numtyp cut_lj_innersq, const int t_per_atom) { int tid, ii, offset; @@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, ljd[tid]=ljd_in[tid]; if (tid+BLOCK_BIO_PAIR cut_lj_innersq) { switch1 = (cut_ljsq-rsq); - numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/ denom_lj; switch1 *= switch1; switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/ diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 201a5c3694..011083db13 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge { int init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, + const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic); /// Clear all host and device data @@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e, _g_ewald, _denom_lj; diff --git a/lib/gpu/lal_charmm_long_ext.cpp 
b/lib/gpu/lal_charmm_long_ext.cpp index 807988a3e8..e24c650be4 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -104,7 +104,7 @@ void crml_gpu_clear() { int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void crml_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index 28045217d3..fb2b643e5e 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic(), _allocated(false) { } template -ColloidT::~Colloid() { +ColloidT::~Colloid() { clear(); } - + template int ColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int ColloidT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, double **host_a12, - double **host_a1, double **host_a2, - double **host_d1, double **host_d2, +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int nlocal, const int nall, const int max_nbors, @@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes, UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); for (int i=0; iucl_device),UCL_READ_ONLY); for (int i=0; i(ceil(static_cast(this->ans->inum())/ 
(BX/this->_threads_per_atom))); @@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &colloid1, &colloid2, &form, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index a4d6c8bf33..28a9809b19 100644 --- a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,18 +24,18 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_colloid(const __global numtyp4 *restrict x_, +__kernel void k_colloid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global numtyp4 *restrict colloid1, const __global numtyp4 *restrict colloid2, - const __global int *form, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *form, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { virial[0] += delx*delx*force; @@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global numtyp4 *restrict colloid1_in, + const __global numtyp4 *restrict colloid1_in, const __global numtyp4 *restrict colloid2_in, - const __global int *form_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int *form_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; @@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, if (eflag>0) 
lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 416beabcdf..dfbd4dbadd 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Colloid : public BaseAtomic { public: Colloid(); - ~Colloid(); + ~Colloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class Colloid : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, - double **host_sigma6, int **host_form, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -65,7 +65,7 @@ class Colloid : public BaseAtomic { UCL_D_Vec lj3; /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 UCL_D_Vec colloid1; - /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, /// colloid2.w = sigma6 UCL_D_Vec colloid2; /// form @@ -76,7 +76,7 @@ class Colloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index ea83cb6417..8e1b18e72f 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,9 +29,9 @@ static Colloid COLLMF; // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, - double **host_a12, double **host_a1, double **host_a2, - double **host_d1, double **host_d2, double **host_sigma3, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int 
nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) - init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, - host_a2, host_d1, host_d2, host_sigma3, + host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, host_a12, host_a1, host_a2, - host_d1, host_d2, host_sigma3, host_sigma6, host_form, + offset, special_lj, host_a12, host_a1, host_a2, + host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -109,8 +109,8 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full, return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 53fb3dae82..a06a29e610 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template CoulT::~Coul() { clear(); } - + template int CoulT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq, scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); @@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { 
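A note on the launch geometry shared by the loop() methods in these hunks: every pair style sizes its grid as GX = ceil(inum/(BX/t_per_atom)), so a block of BX threads covers BX/t_per_atom atoms with t_per_atom cooperating lanes per atom, then dispatches either the shared-memory k_pair_fast kernel (when all type constants fit) or the general k_pair kernel. A minimal standalone sketch of that sizing arithmetic, with inum, BX and t_per_atom as illustrative values rather than the library's Answer/Device state:

    #include <cmath>
    #include <cstdio>

    int main() {
      const int inum = 1000;     // local atoms to process (illustrative)
      const int BX = 128;        // threads per block (illustrative)
      const int t_per_atom = 4;  // cooperating lanes per atom (illustrative)
      // Each block handles BX/t_per_atom atoms, so round the grid size up:
      const int GX = static_cast<int>(
          std::ceil(static_cast<double>(inum) / (BX / t_per_atom)));
      std::printf("launch %d blocks of %d threads for %d atoms\n", GX, BX, inum);
      return 0;
    }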
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index e955922a7c..503e674c81 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, - const __global numtyp *restrict q_, - const __global numtyp *restrict cutsq, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,13 +39,13 @@ class Coul : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Coul : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 990dff6db9..9098aeacb1 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -37,7 +37,7 @@ template 
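The coul/debye hunks that follow implement a screened Coulomb interaction, E = qqrd2e*qi*qj*exp(-kappa*r)/r, where kappa is the inverse Debye length passed into init(). A scalar sketch of that energy together with the force factor the kernels apply to the displacement components (all inputs here are illustrative assumptions):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double qqrd2e = 332.06371;  // illustrative conversion constant
      const double kappa = 0.5, qi = 1.0, qj = -1.0, r = 3.0;
      const double screening = std::exp(-kappa * r);
      const double e_coul = qqrd2e * qi * qj * screening / r;
      // -dE/dr = qqrd2e*qi*qj*exp(-kappa*r)*(kappa + 1/r)/r; dividing by r
      // once more gives the factor multiplied onto delx, dely, delz:
      const double force_over_r =
          qqrd2e * qi * qj * screening * (kappa + 1.0 / r) / (r * r);
      std::printf("E = %g  F/r = %g\n", e_coul, force_over_r);
      return 0;
    }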
CoulDebyeT::~CoulDebye() { clear(); } - + template int CoulDebyeT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale, _qqrd2e=qqrd2e; _kappa=kappa; - + _allocated=true; this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes(); return 0; @@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) { // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale); } @@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index 0e4c0ea2d0..464a1b18de 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndtrung@umich.edu // ***************************************************************************/ @@ -31,16 +31,16 @@ texture q_tex; __kernel void k_coul_debye(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, - const int lj_types, + const int lj_types, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp *restrict q_ , - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const numtyp kappa, const int t_per_atom) { int tid, ii, offset; @@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_scale, double **host_cutsq, double *host_special_coul, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double qqrd2e, const double kappa); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double 
**host_scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e,_kappa; diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index ced08b63e4..af9156c24c 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CDEMF.device->world_me(); int gpu_rank=CDEMF.device->gpu_rank(); int procs_per_gpu=CDEMF.device->procs_per_gpu(); - + if (world_me==0) CDEMF.reinit(ntypes, host_scale); - + CDEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -123,8 +123,8 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void cdebye_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index ca81d32b2d..32c4342fbe 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -37,18 +37,18 @@ template CoulDSFT::~CoulDSF() { clear(); } - + template int CoulDSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index fc5bf5f138..82c44cd382 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -31,18 +31,18 @@ texture q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, +__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,19 +60,19 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -102,9 +102,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, numtyp erfcd = ucl_exp(-alpha*alpha*rsq); numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r); erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd; - forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + + forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + rsq*f_shift-factor_coul); - + force = forcecoul * r2inv; f.x+=delx*force; @@ -131,17 +131,17 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp e_shift, const numtyp f_shift, + const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -149,7 +149,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -157,25 +157,25 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + 
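The DSF kernels above add a per-atom self energy, e_self = -(e_shift/2 + alpha/sqrt(pi))*q*q*qqrd2e, and evaluate erfc(alpha*r) with the rational polynomial seen in the erfcc/erfcd lines. EWALD_P and A1..A5 are defined elsewhere in the library; the constants below are assumptions (the standard Abramowitz–Stegun 7.1.26 values) so the sketch is self-contained and can be checked against std::erfc:

    #include <cmath>
    #include <cstdio>

    // Rational approximation of erfc(x); constants assumed to match the
    // usual Abramowitz & Stegun 7.1.26 table.
    double erfc_approx(double x) {
      const double P = 0.3275911;
      const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
                   A4 = -1.453152027, A5 = 1.061405429;
      const double t = 1.0 / (1.0 + P * x);
      return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
    }

    int main() {
      for (double x : {0.1, 0.5, 1.0, 2.0})
        std::printf("erfc(%.1f): approx %.7f  exact %.7f\n",
                    x, erfc_approx(x), std::erfc(x));
      return 0;
    }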
__syncthreads(); - + if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } - + for ( ; nbor { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha); /// Clear all host and device data @@ -62,7 +62,7 @@ class CoulDSF : public BaseCharge { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index e65a090a16..026dd924c9 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -27,11 +27,11 @@ static CoulDSF CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int cdsf_gpu_init(const int ntypes, const int inum, const int nall, +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double e_shift, const double f_shift, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); @@ -55,8 +55,8 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); @@ -73,12 +73,12 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_cut_coulsq, host_special_coul, + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,7 +95,7 @@ void cdsf_gpu_clear() { int** cdsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double 
*subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -105,8 +105,8 @@ int** cdsf_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 291546d5b1..f03d8fcdfc 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndtrung@umich.edu ***************************************************************************/ @@ -75,7 +75,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -93,16 +93,16 @@ void coul_gpu_reinit(const int ntypes, double **host_scale) { int world_me=COULMF.device->world_me(); int gpu_rank=COULMF.device->gpu_rank(); int procs_per_gpu=COULMF.device->procs_per_gpu(); - + if (world_me==0) COULMF.reinit(ntypes, host_scale); - + COULMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -113,7 +113,7 @@ void coul_gpu_clear() { int** coul_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -123,8 +123,8 @@ int** coul_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); -} - +} + void coul_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index d6e16a9668..b4c6a44d2f 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -36,7 +36,7 @@ template CoulLongT::~CoulLong() { clear(); } - + template int CoulLongT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -51,7 +51,7 @@ int CoulLongT::init(const int ntypes, double **host_scale, const double qqrd2e, const double g_ewald) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,coul_long,"k_coul_long"); + gpu_split,_screen,coul_long,"k_coul_long"); if (success!=0) return success; @@ -67,13 +67,13 @@ int CoulLongT::init(const int ntypes, double **host_scale, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); - + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<4; i++) { 
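The loop here stages the four special-bond Coulomb scale factors into sp_cl; the pair kernels later recover which slot applies to a neighbor from the top two bits of its packed index. A sketch of that tag/untag convention, assuming the bit layout LAMMPS itself uses (SBBITS = 30, a 30-bit neighbor mask) and illustrative scale factors:

    #include <cstdio>

    const unsigned SBBITS = 30;              // assumed position of the 2-bit tag
    const unsigned NEIGHMASK = 0x3FFFFFFFu;  // low 30 bits hold the neighbor index

    inline unsigned sbmask(unsigned j) { return (j >> SBBITS) & 3u; }

    int main() {
      // slot 0 = plain neighbor; slots 1..3 = 1-2, 1-3, 1-4 special bonds
      const double sp_cl[4] = {1.0, 0.0, 0.0, 0.5};  // illustrative factors
      const unsigned j = (3u << SBBITS) | 12345u;    // neighbor 12345 tagged as 1-4
      std::printf("neighbor %u, coulomb scale %g\n",
                  j & NEIGHMASK, sp_cl[sbmask(j)]);
      return 0;
    }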
host_write[i]=host_special_coul[i]; @@ -129,7 +129,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -141,13 +141,13 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 12bbbee7d2..365195e00c 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -123,16 +123,16 @@ texture q_tex; #endif -__kernel void k_coul_long(const __global numtyp4 *restrict x_, +__kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, - const __global numtyp *restrict sp_cl_in, + const __global numtyp *restrict sp_cl_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { @@ -216,15 +216,15 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, +__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale_in, const __global numtyp *restrict sp_cl_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 52ed60111b..d12198fccc 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -30,7 +30,7 @@ class CoulLong : public BaseCharge { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,13 +40,13 @@ class CoulLong : public BaseCharge { int init(const int ntypes, double **scale, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); - + const double gpu_split, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + 
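The g_ewald argument of init() above is the Ewald splitting parameter: these kernels compute only the short-range piece qqrd2e*qi*qj*erfc(g_ewald*r)/r, leaving the smooth complement to the k-space solver. A scalar sketch of that energy and its force factor, with illustrative inputs (EWALD_F is 2/sqrt(pi)):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double qqrd2e = 332.06371;  // illustrative conversion constant
      const double g_ewald = 0.3, qi = 1.0, qj = -1.0, r = 3.0;
      const double EWALD_F = 1.12837916709551;  // 2/sqrt(pi)
      const double grij = g_ewald * r;
      const double expm2 = std::exp(-grij * grij);
      const double prefactor = qqrd2e * qi * qj / r;
      const double e_coul = prefactor * std::erfc(grij);
      // -dE/dr divided by r, i.e. the factor applied to delx, dely, delz:
      const double force_over_r =
          prefactor * (std::erfc(grij) + EWALD_F * grij * expm2) / (r * r);
      std::printf("E = %g  F/r = %g\n", e_coul, force_over_r);
      return 0;
    }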
/// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **scale); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 5552dc2437..06c102b2d1 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -95,16 +95,16 @@ void cl_gpu_reinit(const int ntypes, double **host_scale) { int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int procs_per_gpu=CLMF.device->procs_per_gpu(); - + if (world_me==0) CLMF.reinit(ntypes, host_scale); - + CLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -114,28 +114,28 @@ void cl_gpu_clear() { } int** cl_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_q, boxlo, prd); + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } void cl_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q,nlocal,boxlo,prd); + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); } double cl_gpu_bytes() { diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index f326657e31..e95f2b30ef 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -45,8 +45,8 @@ DeviceT::~Device() { template int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, + const int last_gpu, const int gpu_mode, + const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char 
*ocl_vendor, const int block_pair) { _nthreads=nthreads; @@ -83,8 +83,8 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); - - // Get the number of procs per node + + // Get the number of procs per node std::map name_map; std::map::iterator np; for (int i=0; i<_world_size; i++) { @@ -104,12 +104,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, split_id=split_num; split_num++; } - + // Set up a per node communicator and find rank within MPI_Comm node_comm; - MPI_Comm_split(_comm_world, split_id, 0, &node_comm); + MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; - MPI_Comm_rank(node_comm,&node_rank); + MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ @@ -120,7 +120,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _time_device=true; if (_procs_per_gpu>1) _time_device=false; - + // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); @@ -128,12 +128,12 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; - + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif - + if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; @@ -144,7 +144,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, if (set_ocl_params(ocl_vendor)!=0) return -11; - + int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) @@ -162,7 +162,7 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; - + if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) @@ -170,19 +170,19 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { #else _ocl_vendor_string="-DKEPLER_OCL"; #endif - } else if (s_vendor=="fermi") { + } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { + } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { + } else if (s_vendor=="phi") { _ocl_vendor_name="Intel Phi"; _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { + } else if (s_vendor=="intel") { _ocl_vendor_name="Intel CPU"; _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { + } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { @@ -220,10 +220,10 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, - const bool rot, const int nlocal, + const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, + const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom, const bool vel) { if (!_device_init) @@ -254,7 +254,7 @@ int DeviceT::init(Answer &ans, const bool charge, // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) return -3; - + _data_in_estimate++; if (charge) 
_data_in_estimate++; @@ -272,12 +272,12 @@ int DeviceT::init(Answer &ans, const bool charge, if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) return -3; } - + if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, - *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; @@ -294,7 +294,7 @@ template int DeviceT::init(Answer &ans, const int nlocal, const int nall) { if (!_device_init) - return -1; + return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; @@ -361,7 +361,7 @@ void DeviceT::init_message(FILE *screen, const char *name, if (i==first_gpu) sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; - else + else sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { @@ -381,7 +381,7 @@ void DeviceT::init_message(FILE *screen, const char *name, } template -void DeviceT::estimate_gpu_overhead(const int kernel_calls, +void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; @@ -394,38 +394,38 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, dev_data_in=new UCL_D_Vec[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } - + if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec[_data_out_estimate]; dev_data_out=new UCL_D_Vec[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } - + if (kernel_calls>0) { kernel_data=new UCL_D_Vec[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } - + for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); - } - + } + for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); - } - + } + for (int i=0; isync(); gpu_barrier(); @@ -439,7 +439,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } - + for (int i=0; i0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } - + if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } -} +} template -void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, +void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; int post_final=0; @@ -557,14 +557,14 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } template -void DeviceT::output_kspace_times(UCL_Timer &time_in, +void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, - Answer &ans, - const double max_bytes, - const double cpu_time, + Answer &ans, + const double max_bytes, + const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; @@ -650,8 +650,8 @@ int DeviceT::compile_kernels() { int flag=0; if (_compiled) - return flag; - + return flag; + 
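The estimate_gpu_overhead() hunks above time one-element transfers and trivial kernel launches over repeated iterations and then average, which is how the library separates fixed per-step launch and driver costs from real work. A stripped-down sketch of that measure-a-tiny-op-and-average pattern, using std::chrono in place of UCL_Timer (names and the trial count are illustrative, not the library API):

    #include <chrono>
    #include <cstdio>

    // Stand-in for a one-element copy or an empty kernel launch.
    void tiny_operation() {
      volatile int sink = 0;
      (void)sink;
    }

    int main() {
      const int trials = 10;  // assumed iteration count
      double total = 0.0;
      for (int i = 0; i < trials; ++i) {
        const auto t0 = std::chrono::steady_clock::now();
        tiny_operation();
        const auto t1 = std::chrono::steady_clock::now();
        total += std::chrono::duration<double>(t1 - t0).count();
      }
      std::printf("average overhead: %.3g s per call\n", total / trials);
      return 0;
    }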
dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) @@ -664,7 +664,7 @@ int DeviceT::compile_kernels() { k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); - + _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) @@ -705,7 +705,7 @@ int DeviceT::compile_kernels() { if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; - return flag; + return flag; } template @@ -718,12 +718,12 @@ template class Device; Device global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *opencl_vendor, const int block_pair) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads,t_per_atom, + particle_split,nthreads,t_per_atom, cell_size,opencl_vendor,block_pair); } diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 28b58f7760..6761b23fbb 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,10 +17,10 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_zero(__global int *restrict mem, +__kernel void kernel_zero(__global int *restrict mem, int numel) { int ii=GLOBAL_ID_X; - + if (ii class PPPM; template class Device { public: Device(); - ~Device(); - + ~Device(); + /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using + * the device (>=first_gpu and <=last_gpu) that this proc will be using * Returns: * - 0 if successfull * - -2 if GPU not found * - -4 if GPU library not compiled for GPU * - -6 if GPU could not be initialized for use - * - -7 if accelerator sharing is not currently allowed on system + * - -7 if accelerator sharing is not currently allowed on system * - -11 if vendor_string has the wrong number of parameters **/ - int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, - const int t_per_atom, const double cell_size, + const int t_per_atom, const double cell_size, char *vendor_string, const int block_pair); /// Initialize the device for Atom and Neighbor storage @@ -62,9 +62,9 @@ class Device { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param max_nbors Initial number of rows in the neighbor matrix - * \param cell_size cutoff+skin + * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom value to be used by the neighbor list only * * Returns: @@ -113,25 +113,25 @@ class Device { /// Returns true if double 
precision is supported on card inline bool double_precision() { return gpu->double_precision(); } - + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, Answer &ans, - Neighbor &nbor, const double avg_split, + void output_times(UCL_Timer &time_pair, Answer &ans, + Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, - const double driver_overhead, + const double driver_overhead, const int threads_per_atom, FILE *screen); /// Output a message with timing information void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer & time_map, UCL_Timer & time_rho, - UCL_Timer &time_interp, - Answer &ans, + UCL_Timer &time_interp, + Answer &ans, const double max_bytes, const double cpu_time, const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); - + /// Clear all memory on host and device void clear_device(); @@ -149,24 +149,24 @@ class Device { while (ans_queue.empty()==false) { evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); ans_queue.pop(); - } + } return evdw; } return 0.0; } /// Start timer on host - inline void start_host_timer() + inline void start_host_timer() { _cpu_full=MPI_Wtime(); _host_timer_started=true; } - + /// Stop timer on host - inline void stop_host_timer() { + inline void stop_host_timer() { if (_host_timer_started) { - _cpu_full=MPI_Wtime()-_cpu_full; + _cpu_full=MPI_Wtime()-_cpu_full; _host_timer_started=false; } } - + /// Return host time inline double host_time() { return _cpu_full; } @@ -239,8 +239,8 @@ class Device { /// Number of threads executing concurrently on same multiproc inline int warp_size() const { return _warp_size; } - // -------------------- SHARED DEVICE ROUTINES -------------------- - // Perform asynchronous zero of integer array + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array void zero(UCL_D_Vec &mem, const int numel) { int num_blocks=static_cast(ceil(static_cast(numel)/ _block_pair)); @@ -248,25 +248,25 @@ class Device { k_zero.run(&mem,&numel); } - // -------------------------- DEVICE DATA ------------------------- + // -------------------------- DEVICE DATA ------------------------- /// Geryon Device UCL_Device *gpu; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; - // --------------------------- ATOM DATA -------------------------- + // --------------------------- ATOM DATA -------------------------- /// Atom Data Atom atom; // --------------------------- NBOR DATA ---------------------------- - + /// Neighbor Data NeighborShared _neighbor_shared; // ------------------------ LONG RANGE DATA ------------------------- - + // Long Range Data int _long_range_precompute; PPPM *pppm_single; @@ -282,7 +282,7 @@ class Device { pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, boxlo,prd); } - + inline std::string compile_string() { return _ocl_compile_string; } private: @@ -290,7 +290,7 @@ class Device { int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; - int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, + int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; @@ -310,10 +310,10 @@ class Device { int compile_kernels(); int _data_in_estimate, _data_out_estimate; - + std::string _ocl_vendor_name, 
_ocl_vendor_string, _ocl_compile_string; int set_ocl_params(char *); - + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp index e96e15eaf9..c97b76c820 100644 --- a/lib/gpu/lal_dipole_lj.cpp +++ b/lib/gpu/lal_dipole_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJT::~DipoleLJ() { clear(); } - + template int DipoleLJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJT::bytes_per_atom(const int max_nbors) const { template int DipoleLJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,7 +151,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { @@ -160,8 +160,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) { &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index b6483d1ef8..745bdb7f27 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -132,17 +132,17 @@ texture mu_tex; #endif -__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, const __global numtyp *restrict cutsq, @@ -171,14 +171,14 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; 
i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - r7inv = r5inv*r2inv; + r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; @@ -251,7 +251,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -263,12 +263,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -276,7 +276,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -284,7 +284,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -306,12 +306,12 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; @@ -322,7 +322,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -340,19 +340,19 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -369,7 
+369,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -381,16 +381,16 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; r7inv = r5inv*r2inv; pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; @@ -463,7 +463,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; - + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); @@ -474,13 +474,13 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } // dipole-charge - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr; pre2 = qj*r3inv; - + forcecoul.x += pre2*mui.x - pre1*delx; forcecoul.y += pre2*mui.y - pre1*dely; forcecoul.z += pre2*mui.z - pre1*delz; @@ -488,7 +488,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -496,7 +496,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; pre2 = qtmp*r3inv; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -519,12 +519,12 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0; + acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr; diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index b08b7a8669..615784ee8b 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJ : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJ : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const 
int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJ : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 55bbe0b804..76722a20b4 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,18 +98,18 @@ void dpl_gpu_clear() { int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} - +} + void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index 5a145dc762..a33f38084f 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -37,7 +37,7 @@ template DipoleLJSFT::~DipoleLJSF() { clear(); } - + template int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -45,8 +45,8 @@ int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { template int DipoleLJSFT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, @@ -138,7 +138,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,17 +151,17 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, + &ainum, &nbor_pitch, &this->atom->q, 
&this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, &this->atom->q, - &this->atom->quat, &cutsq, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8469ed9ac9..9847e84823 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -133,20 +133,20 @@ texture mu_tex; #endif -__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp *restrict q_ , const __global numtyp4 *restrict mu_, - const __global numtyp *restrict cutsq, + const __global numtyp *restrict cutsq, const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -172,14 +172,14 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); - + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); 
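// (editor's note, not in the original patch: gathering the pre2/pre4 terms
// above and below, each accumulated Coulombic torque component on dipole i is
//   tau_i = -bfac*r3inv*(mu_i x mu_j) + 3*bfac*r5inv*(mu_j . r)*(mu_i x r),
// the shifted-force analogue of the point-dipole torque.)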
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); @@ -285,12 +285,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -301,7 +301,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -309,10 +309,10 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; rcutcoul2inv=ucl_recip(lj1[mtype].w); pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -334,13 +334,13 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -350,12 +350,12 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, if (rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; @@ -372,19 +372,19 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, +__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const __global numtyp *restrict q_, const __global numtyp4 *restrict mu_, - const __global numtyp *restrict _cutsq, + const __global numtyp *restrict _cutsq, const numtyp 
qqrd2e, const int t_per_atom) { int tid, ii, offset; @@ -402,7 +402,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp e_coul=(acctyp)0; acctyp4 f; @@ -414,16 +414,16 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; - + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; - + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); aforcecoul.x = pre1*delx; aforcecoul.y = pre1*dely; aforcecoul.z = pre1*delz; - + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; presf = (numtyp)2.0*r2inv*pidotr*pjdotr; bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); - + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); - + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; pre4 = -bfac*r3inv; @@ -529,11 +529,11 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, // dipole-charge if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { - r3inv = r2inv*rinv; + r3inv = r2inv*rinv; r5inv = r3inv*r2inv; pidotr = mui.x*delx + mui.y*dely + mui.z*delz; pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); - pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qj*r3inv * pqfac; @@ -544,7 +544,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, ticoul.y += pre2 * (mui.z*delx - mui.x*delz); ticoul.z += pre2 * (mui.x*dely - mui.y*delx); } - + // charge-dipole if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { r3inv = r2inv*rinv; @@ -552,10 +552,10 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); - qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); pre2 = qtmp*r3inv * qpfac; - + forcecoul.x += pre1*delx - pre2*muj.x; forcecoul.y += pre1*dely - pre2*muj.y; forcecoul.z += pre1*delz - pre2*muj.z; @@ -577,13 +577,13 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, tor.z+=fq*ticoul.z; if (eflag>0) { - acctyp e = (acctyp)0.0; + acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); e = qtmp*qj*rinv*fac*fac; if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); - if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) e += -qj*r3inv*pidotr * pqfac; if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) e += qtmp*r3inv*pjdotr * qpfac; @@ -593,12 +593,12 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, if 
(rsq < lj1[mtype].z) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + - rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + - rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + (numtyp)4.0*lj3[mtype].y); energy+=factor_lj*e; - } + } } if (vflag>0) { virial[0] += delx*force.x; diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index 83cea4c2a4..20357385a2 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class DipoleLJSF : public BaseDipole { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,7 +40,7 @@ class DipoleLJSF : public BaseDipole { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, @@ -70,7 +70,7 @@ class DipoleLJSF : public BaseDipole { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; numtyp _qqrd2e; diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 8abf78c903..68b935ff38 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,18 +98,18 @@ void dplsf_gpu_clear() { int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double **host_mu, + bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); -} - +} + void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git 
a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index 3736f89323..4f6f2d641f 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -33,23 +33,23 @@ DPDT::DPD() : BaseDPD(), _allocated(false) { } template -DPDT::~DPD() { +DPDT::~DPD() { clear(); } - + template int DPDT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int DPDT::init(const int ntypes, - double **host_cutsq, double **host_a0, - double **host_gamma, double **host_sigma, +int DPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, - const bool tstat_only, - const int nlocal, const int nall, - const int max_nbors, const int maxspecial, + const bool tstat_only, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; @@ -76,7 +76,7 @@ int DPDT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma, - host_sigma,host_cut); + host_sigma,host_cut); UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -90,7 +90,7 @@ int DPDT::init(const int ntypes, _tstat_only = 0; if (tstat_only) _tstat_only=1; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes(); return 0; @@ -130,7 +130,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +147,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_tstat_only, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, &this->_timestep, &this->_tstat_only, @@ -164,7 +164,7 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, - host_sigma,host_cut); + host_sigma,host_cut); } - + template class DPD; diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index 209bc0233e..e32404ff5c 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -37,7 +37,7 @@ texture vel_tex; #define _USE_UNIFORM_SARU_LCG #endif -// References: +// References: // 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. // 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. // PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 @@ -49,9 +49,9 @@ texture vel_tex; #define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] -// using the inherent LCG, then multiply u with sqrt(3) to "match" -// with a normal random distribution. +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. // Afshar et al. 
mutlplies u in [-0.5;0.5] with sqrt(12) // Curly brackets to make variables local to the scope. #ifdef _USE_UNIFORM_SARU_LCG @@ -80,8 +80,8 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 -// then multiply u with sqrt(3) to "match" with a normal random distribution +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution // Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) #ifdef _USE_UNIFORM_SARU_TEA8 #define SQRT3 (numtyp)1.7320508075688772935274463 @@ -119,7 +119,7 @@ texture vel_tex; #endif // specifically implemented for steps = 1; high = 1.0; low = -1.0 -// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], // and uses the polar method (Marsaglia's) to transform to a normal random value // This is used to compared with CPU DPD using RandMars::gaussian() #ifdef _USE_GAUSSIAN_SARU_LCG @@ -160,20 +160,20 @@ texture vel_tex; randnum = r2*fac; \ } #endif - -__kernel void k_dpd(const __global numtyp4 *restrict x_, + +__kernel void k_dpd(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, + const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; @@ -185,13 +185,13 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -244,7 +244,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -254,7 +254,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -272,23 +272,23 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, +__kernel void k_dpd_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, 
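// (editor's sketch, not in the original patch, illustrating the sqrt(3) scaling
// described in the saru comments above: a uniform u on [-1,1] has mean 0 and
// variance 1/3, so sqrt(3)*u matches the first two moments of a standard
// normal, which is all the DPD random force requires:
//   numtyp u  = (numtyp)2.0*u01 - (numtyp)1.0;  // u01: hypothetical uniform in [0,1)
//   numtyp xi = SQRT3*u;                        // E[xi] = 0, Var[xi] = 1
// equivalently sqrt(12) times a uniform on [-0.5,0.5], as in Afshar et al.)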
+ const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const __global numtyp4 *restrict v_, const __global numtyp *restrict cutsq, - const numtyp dtinvsqrt, const int seed, + const numtyp dtinvsqrt, const int seed, const int timestep, const int tstat_only, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,7 +296,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { tag1 = jtag; tag2 = itag; } - - numtyp randnum = (numtyp)0.0; + + numtyp randnum = (numtyp)0.0; saru(tag1, tag2, seed, timestep, randnum); // conservative force = a0 * wd, or 0 if tstat only @@ -364,7 +364,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, force -= coeff[mtype].y*wd*wd*dot*rinv; force += coeff[mtype].z*wd*randnum*dtinvsqrt; force*=factor_dpd*rinv; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; @@ -374,7 +374,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; - energy+=factor_dpd*e; + energy+=factor_dpd*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 449d7b1d8c..42ef854522 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -24,23 +24,23 @@ template class DPD : public BaseDPD { public: DPD(); - ~DPD(); + ~DPD(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, double **host_cutsq, double **host_a0, + int init(const int ntypes, double **host_cutsq, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut, double *host_special_lj, bool tstat_only, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, + const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -52,11 +52,11 @@ class DPD : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Update coeff if needed (tstat only) void update_coeff(int ntypes, double **host_a0, double **host_gamma, double **host_sigma, double **host_cut); - + // --------------------------- TYPE DATA -------------------------- /// coeff.x = a0, coeff.y = gamma, coeff.z = sigma, coeff.w = cut @@ -70,12 +70,12 @@ class DPD : public BaseDPD { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Only used for thermostat int _tstat_only; - + private: bool _allocated; void loop(const bool _eflag, const bool _vflag); diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index 327074d087..26bbb660b8 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -54,7 +54,7 @@ int 
dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -72,12 +72,12 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, + init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, host_cut, special_lj, tstat_only, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -95,25 +95,25 @@ void dpd_gpu_clear() { int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd) { return DPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, + vatom, host_start, ilist, jnum, cpu_time, success, host_v, dtinvsqrt, seed, timestep, boxlo, prd); -} - +} + void dpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, tagint *tag, - double **host_v, const double dtinvsqrt, - const int seed, const int timestep, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { DPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index c856a8e667..b83972f4db 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -9,10 +9,10 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ - + #if defined(USE_OPENCL) #include "eam_cl.h" #elif defined(USE_CUDART) @@ -33,7 +33,7 @@ using namespace LAMMPS_AL; extern Device device; template -EAMT::EAM() : BaseAtomic(), +EAMT::EAM() : BaseAtomic(), _compiled_energy(false), _allocated(false) { } @@ -41,46 +41,46 @@ template EAMT::~EAM() { clear(); } - + template int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, double rdr, double rdrho, + double ***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, - const int 
maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,eam,"k_eam"); - + if (success!=0) return success; - + // allocate fp - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; _max_fp_size=static_cast(static_cast(ef_nall)*1.10); _fp.alloc(_max_fp_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - + k_energy.set_function(*(this->pair_program),"k_energy"); k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); _compiled_energy = true; - + // Initialize timers for selected GPU time_pair2.init(*(this->ucl_device)); time_pair2.zero(); - + time_fp1.init(*(this->ucl_device)); time_fp1.zero(); - + time_fp2.init(*(this->ucl_device)); time_fp2.zero(); @@ -93,7 +93,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, lj_types=max_shared_types; shared_types=true; } - + _ntypes=lj_types; _cutforcesq=host_cutforcesq; _rdr=rdr; @@ -104,26 +104,26 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _nz2r=nz2r; _nfrho=nfrho; _nr=nr; - + UCL_H_Vec dview_type(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; iucl_device),UCL_READ_ONLY); - + for (int i=0; i dview_type2frho(lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -136,7 +136,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, // pack frho_spline UCL_H_Vec dview_frho_spline(nfrho*(nrho+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_rhor_spline(nrhor*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ix dview_z2r_spline(nz2r*(nr+1),*(this->ucl_device), UCL_WRITE_ONLY); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline1,dview_z2r_spline,false); z2r_spline1_tex.get_texture(*(this->pair_program),"z2r_sp1_tex"); z2r_spline1_tex.bind_float(z2r_spline1,4); - + for (int ix=0; ixucl_device),UCL_READ_ONLY); ucl_copy(z2r_spline2,dview_z2r_spline,false); z2r_spline2_tex.get_texture(*(this->pair_program),"z2r_sp2_tex"); @@ -241,7 +241,7 @@ void EAMT::clear() { if (!_allocated) return; _allocated=false; - + type2rhor_z2r.clear(); type2frho.clear(); rhor_spline1.clear(); @@ -250,13 +250,13 @@ void EAMT::clear() { frho_spline2.clear(); z2r_spline1.clear(); z2r_spline2.clear(); - + _fp.clear(); - + time_pair2.clear(); time_fp1.clear(); time_fp2.clear(); - + if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); @@ -283,20 +283,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } - + // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); @@ -313,7 +313,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, this->zero_timers(); return; } - + int 
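// (editor's note, not in the original patch: hd_balancer splits the inum_full
// local atoms between device and host; balance() returns the device share
// inum, and atoms from host_start=inum onward are handled on the CPU.)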
ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -326,7 +326,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->atom->add_x_data(host_x,host_type); @@ -345,36 +345,36 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // --------------------------------------------------------------------------- template int** EAMT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, + double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, int &inum, + const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); - + if (this->device->time_device()) { // Put time from the second part to the total time_pair this->time_pair.add_time_to_total(time_pair2.time()); - + // Add transfer time from device -> host after part 1 this->atom->add_transfer_time(time_fp1.time()); - + // Add transfer time from host -> device before part 2 this->atom->add_transfer_time(time_fp2.time()); } // ------------------- Resize FP Array for EAM -------------------- - + if (nall>_max_fp_size) { _max_fp_size=static_cast(static_cast(nall)*1.10); _fp.resize(_max_fp_size); fp_tex.bind_float(_fp,1); - } - *fp_ptr=_fp.host.begin(); + } + *fp_ptr=_fp.host.begin(); // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -382,14 +382,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); @@ -403,14 +403,14 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, *jnum=this->nbor->host_acc.begin(); loop(eflag,vflag); - + // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); _fp.update_host(inum_full,true); time_fp1.stop(); time_fp1.sync_stop(); - + return this->nbor->host_jlist.begin()-host_start; } @@ -420,20 +420,20 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, template void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { - if (this->ans->inum()==0) + if (this->ans->inum()==0) return; - + this->hd_balancer.start_timer(); time_fp2.start(); this->add_fp_data(); time_fp2.stop(); - + loop2(eflag,vflag); if (ilist == NULL) this->ans->copy_answers(eflag,vflag,eatom,vatom); else this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); - + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -455,27 +455,27 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int 
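// (editor's note, not in the original patch: GX above is the launch grid,
// ceil(inum/(BX/t_per_atom)) blocks of BX threads, so every atom gets
// t_per_atom cooperating threads whose partial sums are reduced before
// the results are stored.)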
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); - + if (shared_types) { this->k_energy_fast.set_size(GX,BX); this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1, &frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, + &rhor_spline2, &frho_spline1, &frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv,&eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); @@ -501,25 +501,25 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair2.start(); - + if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, - &rhor_spline1, &z2r_spline1, &z2r_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &rhor_spline1, &z2r_spline1, &z2r_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &_cutforcesq, &_rdr, &_nr, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, + this->k_pair.run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index 054b3ca6db..13440b7d45 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ @@ -82,7 +82,7 @@ texture z2r_sp2_tex; engv[ii]=energy; \ } \ } - + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -188,37 +188,37 @@ texture z2r_sp2_tex; #endif -__kernel void k_energy(const __global numtyp4 *restrict x_, +__kernel void k_energy(const __global numtyp4 *restrict x_, const __global int2 *restrict type2rhor_z2r, - const __global int *restrict type2frho, - const __global numtyp4 *restrict rhor_spline2, + const __global int *restrict type2frho, + const __global numtyp4 *restrict rhor_spline2, const __global numtyp4 *restrict frho_spline1, const __global numtyp4 *restrict frho_spline2, - const __global int *dev_nbor, + const __global int *dev_nbor, const __global int *dev_packed, - __global numtyp *restrict fp_, - __global acctyp *restrict engv, + __global numtyp *restrict fp_, + __global acctyp *restrict engv, const int eflag, const int inum, const int nbor_pitch, - const int ntypes, const numtyp cutforcesq, - const numtyp rdr, const 
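// (editor's note, not in the original patch: rdr and rdrho are presumably the
// reciprocal table spacings 1/dr and 1/drho used to map r and rho onto the
// nr- and nrho-point spline tables during interpolation.)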
numtyp rdrho, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp rho = (acctyp)0; acctyp energy = (acctyp)0; - + if (ii { public: EAM(); ~EAM(); - + /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,11 +41,11 @@ class EAM : public BaseAtomic { int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, - double rdrho, double rhomax, int nrhor, int nrho, int nz2r, - int nfrho, int nr, const int nlocal, const int nall, + double rdrho, double rhomax, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen); - + // Copy charges to device asynchronously inline void add_fp_data() { int nghost=this->atom->nall()-_nlocal; @@ -57,7 +57,7 @@ class EAM : public BaseAtomic { ucl_copy(dev_view,host_view,nghost,true); } } - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -67,7 +67,7 @@ class EAM : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring void compute(const int f_ago, const int inum_full, const int, const int nall, double **host_x, int *host_type, int *ilist, int *numj, @@ -75,23 +75,23 @@ class EAM : public BaseAtomic { const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, int &inum, void **fp_ptr); /// Pair loop with host neighboring - void compute2(int *ilist, const bool eflag, const bool vflag, + void compute2(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom); - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_energy, k_energy_fast; - + // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; UCL_Texture rhor_spline1_tex, rhor_spline2_tex; @@ -99,37 +99,37 @@ class EAM : public BaseAtomic { UCL_Texture z2r_spline1_tex, z2r_spline2_tex; // --------------------------- DEVICE DATA -------------------------- - + /// Device Timers UCL_Timer time_pair2, time_fp1, time_fp2; - + // --------------------------- TYPE DATA -------------------------- - + UCL_D_Vec type2rhor_z2r; UCL_D_Vec type2frho; - + UCL_D_Vec z2r_spline1, z2r_spline2; UCL_D_Vec frho_spline1, frho_spline2; UCL_D_Vec rhor_spline1, rhor_spline2; - + numtyp _cutforcesq,_rdr,_rdrho, _rhomax; - + int _nfrho,_nrhor,_nrho,_nz2r,_nr; - + /// If atom 
type constants fit in shared memory, use fast kernels bool shared_types; - - /// Number of atom types + + /// Number of atom types int _ntypes; - + int _max_fp_size; - + /// True of energy kernels are compiled bool _compiled_energy; - + /// Per-atom arrays UCL_Vector _fp; - + protected: bool _allocated; int _nlocal; diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index 282f93afeb..9209ed5c26 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMALMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, +int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMALMF.clear(); gpu_mode=EAMALMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMALMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMALMF.device->init_message(screen,"eam/alloy",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_alloy_gpu_clear() { int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, 
int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_alloy_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index d56f750e2f..1b5602f808 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_gpu_init(const int ntypes, double host_cutforcesq, +int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_gpu_clear() { int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, 
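/* -----------------------------------------------------------------------
   Note on the *_ext.cpp layout: each file owns one static, precision-
   specialized instance (EAMALMF/EAMMF/EAMFSMF) and exposes plain
   functions that forward to it, so the LAMMPS pair style never sees the
   template types.  A stripped-down sketch of the pattern (PairGPU and
   the pair_gpu_* names are illustrative only):
------------------------------------------------------------------------- */
template <class numtyp, class acctyp>
class PairGPU {
 public:
  int init() { return 0; }   /* allocate + copy constants in the real code */
  void clear() {}            /* release host and device data */
};

static PairGPU<float, double> PairGPUMF;  /* one instance per pair style */

int pair_gpu_init() { return PairGPUMF.init(); }
void pair_gpu_clear() { PairGPUMF.clear(); }
/* --------------------------- end sketch ------------------------------- */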
double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 4992f3ab98..b9e25466aa 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ @@ -27,14 +27,14 @@ static EAM EAMFSMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, +int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, int nrhor, - int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double rdr, double rdrho, double rhomax, int nrhor, + int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMFSMF.clear(); gpu_mode=EAMFSMF.device->gpu_mode(); @@ -46,11 +46,11 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, int procs_per_gpu=EAMFSMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; - + fp_size=sizeof(PRECISION); - + EAMFSMF.device->init_message(screen,"eam/fs",first_gpu,last_gpu); bool message=false; @@ -66,7 +66,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -86,12 +86,12 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, 
nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -108,7 +108,7 @@ void eam_fs_gpu_clear() { int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -117,10 +117,10 @@ int ** eam_fs_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); -} +} -void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall, double **host_x, int *host_type, +void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index b33f087212..71668f5e02 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -245,8 +245,8 @@ ucl_inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans) ucl_inline numtyp gpu_det3(const numtyp m[9]) { - numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - - m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + + numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - + m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + m[6]*m[1]*m[5] - m[6]*m[2]*m[4]; return ans; }; @@ -255,7 +255,7 @@ ucl_inline numtyp gpu_det3(const numtyp m[9]) diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], +ucl_inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], numtyp ans[9]) { ans[0] = shape.x*m[0]; @@ -421,7 +421,7 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, t = aug[9]/aug[5]; aug[10]-=t*aug[6]; aug[11]-=t*aug[7]; - + if (aug[10] == (numtyp)0.0) *error_flag=2; @@ -440,11 +440,11 @@ ucl_inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans, quat = [w i j k] ------------------------------------------------------------------------- */ -ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, +ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, numtyp mat[9]) { numtyp4 q; fetch4(q,qi,quat_tex); - + numtyp w2 = q.x*q.x; numtyp i2 = q.y*q.y; numtyp j2 = q.z*q.z; @@ -463,7 +463,7 @@ ucl_inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, mat[1] = twoij+twokw; mat[4] = w2-i2+j2-k2; mat[7] = twojk-twoiw; - + mat[2] = twoik-twojw; mat[5] = twojk+twoiw; mat[8] = w2-i2-j2+k2; @@ -561,7 +561,7 @@ ucl_inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9]) ------------------------------------------------------------------------- */ ucl_inline void gpu_times_column3(const numtyp m[9], const numtyp v[3], - numtyp ans[3]) + numtyp ans[3]) { ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2]; ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2]; diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 30d864aecc..cac77f5dd3 
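/* -----------------------------------------------------------------------
   Note on gpu_quat_to_mat_trans() above: it expands a unit quaternion
   [w i j k] into the transpose of its rotation matrix.  The same algebra
   as a self-contained host function (the first column is inferred from
   the standard rotation-matrix form; the remaining entries mirror the
   hunk):
------------------------------------------------------------------------- */
void quat_to_mat_trans(const double q[4], double mat[9]) {
  double w2 = q[0]*q[0], i2 = q[1]*q[1], j2 = q[2]*q[2], k2 = q[3]*q[3];
  double twoij = 2.0*q[1]*q[2], twoik = 2.0*q[1]*q[3], twojk = 2.0*q[2]*q[3];
  double twoiw = 2.0*q[1]*q[0], twojw = 2.0*q[2]*q[0], twokw = 2.0*q[3]*q[0];

  mat[0] = w2+i2-j2-k2;  mat[3] = twoij-twokw;  mat[6] = twoik+twojw;
  mat[1] = twoij+twokw;  mat[4] = w2-i2+j2-k2;  mat[7] = twojk-twoiw;
  mat[2] = twoik-twojw;  mat[5] = twojk+twoiw;  mat[8] = w2-i2-j2+k2;
}
/* --------------------------- end sketch ------------------------------- */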
100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -29,14 +29,14 @@ texture pos_tex; // -- Only unpack neighbors matching the specified inclusive range of forms // -- Only unpack neighbors within cutoff // --------------------------------------------------------------------------- -__kernel void kernel_nbor(const __global numtyp4 *restrict x_, - const __global numtyp2 *restrict cut_form, - const int ntypes, +__kernel void kernel_nbor(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict cut_form, + const int ntypes, __global int *dev_nbor, - const int nbor_pitch, const int start, const int inum, - const __global int *dev_ij, + const int nbor_pitch, const int start, const int inum, + const __global int *dev_ij, const int form_low, const int form_high) { - + // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -47,11 +47,11 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); int packed=ii+nbor_pitch+nbor_pitch; - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); - int newj=0; + int newj=0; for ( ; nbor=form_low && form[mtype]<=form_high) { // Compute r12; numtyp rsq=jx.x-ix.x; diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp index 342ec4ecda..1ef215d7ff 100644 --- a/lib/gpu/lal_gauss.cpp +++ b/lib/gpu/lal_gauss.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ GaussT::Gauss() : BaseAtomic(), _allocated(false) { } template -GaussT::~Gauss() { +GaussT::~Gauss() { clear(); } - + template int GaussT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int GaussT::init(const int ntypes, - double **host_cutsq, double **host_a, - double **host_b, double **host_offset, +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -75,7 +75,7 @@ int GaussT::init(const int ntypes, gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b, - host_cutsq,host_offset); + host_cutsq,host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -94,12 +94,12 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b, - host_cutsq,host_offset); + host_cutsq,host_offset); } template @@ -135,7 +135,7 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 6accf36a06..98e71ea413 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // 
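/* -----------------------------------------------------------------------
   Note on the type_pack4() calls in Gauss::init() above: four per-type-
   pair host tables (a, b, cutsq, offset) are interleaved into one flat
   array of 4-vectors, so a kernel fetches all coefficients for a type
   pair with a single indexed load.  A sketch of that packing idea (not
   the library routine; float4 stands in for numtyp4, types are 1-based
   as in LAMMPS, and out must hold lj_types*lj_types zero-initialized
   entries):
------------------------------------------------------------------------- */
#include <vector_types.h>  /* CUDA float4; any struct with x,y,z,w works */

void pack4(int ntypes, int lj_types, float4 *out,
           double **a, double **b, double **c, double **d) {
  for (int i = 1; i <= ntypes; i++)
    for (int j = 1; j <= ntypes; j++) {
      float4 v;
      v.x = (float)a[i][j];  v.y = (float)b[i][j];
      v.z = (float)c[i][j];  v.w = (float)d[i][j];
      out[i*lj_types + j] = v;   /* flat row-major (itype,jtype) index */
    }
}
/* --------------------------- end sketch ------------------------------- */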
__________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_gauss(const __global numtyp4 *restrict x_, +__kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -108,18 +108,18 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, +__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, - const __global int *dev_packed, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -127,7 +127,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, if (tid0) { - numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1fd58adae5..d023310c6d 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Gauss : public BaseAtomic { public: Gauss(); - ~Gauss(); + ~Gauss(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,16 +38,16 @@ class Gauss : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double **host_a, double **host_b, 
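/* -----------------------------------------------------------------------
   Note on the k_gauss kernels above: the accumulated pair energy is
   e = -(A*exp(-B*r^2) - offset).  Differentiating that energy gives the
   radial force factor such kernels fold into (delx,dely,delz); a scalar
   reference version, with sign conventions following the energy
   expression in the hunk:
------------------------------------------------------------------------- */
#include <math.h>

/* E(r) = -A*exp(-B*rsq); returns F/r, the factor multiplied into the
   displacement vector (before any special-bond scaling by factor_lj). */
double gauss_force_over_r(double A, double B, double rsq) {
  /* dE/dr = 2*A*B*r*exp(-B*r^2)  =>  F/r = (-dE/dr)/r */
  return -2.0 * A * B * exp(-B * rsq);
}
/* --------------------------- end sketch ------------------------------- */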
double **host_offset, + double **host_a, double **host_b, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_a, double **host_b, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Gauss : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 7c15a12591..7fa4b68870 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,9 +27,9 @@ static Gauss GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_b, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); @@ -54,7 +54,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -77,7 +77,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); - + if (world_me==0) GLMF.reinit(ntypes, cutsq, host_a, host_b, offset); - + GLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,8 +124,8 @@ int ** gauss_gpu_compute_n(const int ago, const int inum_full, return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index 1d38810ae8..ba15af672e 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -37,21 +37,21 
@@ GayBerneT::GayBerne() : BaseEllipsoid(), } template -GayBerneT::~GayBerne() { +GayBerneT::~GayBerne() { clear(); } - + template int GayBerneT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int GayBerneT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, +int GayBerneT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -84,27 +84,27 @@ int GayBerneT::init(const int ntypes, const double gamma, sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, - host_sigma,host_epsilon); + host_sigma,host_epsilon); this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, - host_cutsq,h_form); + host_cutsq,h_form); lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq,h_form); + host_cutsq,h_form); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions gamma_upsilon_mu.alloc(7,*(this->ucl_device),UCL_READ_ONLY); - host_write[0]=static_cast(gamma); + host_write[0]=static_cast(gamma); host_write[1]=static_cast(upsilon); host_write[2]=static_cast(mu); host_write[3]=static_cast(host_special_lj[0]); @@ -117,7 +117,7 @@ int GayBerneT::init(const int ntypes, const double gamma, UCL_H_Vec d_view; d_view.view(host_lshape,lshape.numel(),*(this->ucl_device)); ucl_copy(lshape,d_view,false); - + // Copy shape, well, sigma, epsilon, and cutsq onto GPU // - cast if necessary shape.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -138,7 +138,7 @@ int GayBerneT::init(const int ntypes, const double gamma, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+gamma_upsilon_mu.row_bytes()+ @@ -155,7 +155,7 @@ void GayBerneT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -170,7 +170,7 @@ void GayBerneT::clear() { well.clear(); lshape.clear(); gamma_upsilon_mu.clear(); - + this->clear_base(); } @@ -196,7 +196,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -209,17 +209,17 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { (BX/this->_threads_per_atom))); NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); 
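/* -----------------------------------------------------------------------
   Note on the "cast if necessary" copies in GayBerne::init() above:
   double-precision host tables are narrowed into a scratch host buffer
   (host_write) and then copied to the device through a typed view.  The
   equivalent idea in plain CUDA, assuming numtyp = float:
------------------------------------------------------------------------- */
#include <cuda_runtime.h>

void cast_and_copy(const double *host, float *scratch, float *dev, int n) {
  for (int i = 0; i < n; i++)
    scratch[i] = (float)host[i];              /* cast if necessary */
  cudaMemcpy(dev, scratch, n * sizeof(float),
             cudaMemcpyHostToDevice);         /* ~ ucl_copy(dst, view, false) */
}
/* --------------------------- end sketch ------------------------------- */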
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); @@ -242,18 +242,18 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum()- this->_last_ellipse)/BX)); this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), - SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, - &this->gamma_upsilon_mu, - &this->sigma_epsilon, &this->_lj_types, - &this->lshape, &this->nbor->dev_nbor, - &stride, &this->ans->force, - &this->ans->engv, &this->dev_error, + &this->shape, &this->well, + &this->gamma_upsilon_mu, + &this->sigma_epsilon, &this->_lj_types, + &this->lshape, &this->nbor->dev_nbor, + &stride, &this->ans->force, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -264,28 +264,28 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->time_ellipsoid.stop(); this->time_nbor2.start(); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->time_ellipsoid2.stop(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, - &this->gamma_upsilon_mu, &stride, + this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, - &this->ans->engv, &this->dev_error, &eflag, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); } else { this->k_lj.set_size(GX,BX); - this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, &this->_lj_types, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -300,12 +300,12 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); this->time_nbor1.start(); this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, 
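/* -----------------------------------------------------------------------
   Note on GayBerne::loop() above: the atom range is partitioned at
   _last_ellipse, and up to three kernels run per step -- ellipsoid-
   ellipsoid, sphere-ellipsoid, and a plain LJ kernel for the spherical
   tail.  A schematic of that dispatch with the launches reduced to
   hypothetical stubs (the real code also packs form-filtered neighbor
   lists and drives per-phase timers around each launch):
------------------------------------------------------------------------- */
void run_ellipse_ellipse(int first, int last);   /* ~ k_ellipsoid        */
void run_sphere_ellipse(int first, int last);    /* ~ k_sphere_ellipsoid */
void run_lj(int first, int last);                /* ~ k_lj / k_lj_fast   */

void loop_dispatch(int inum, int last_ellipse) {
  if (last_ellipse > 0) {
    run_ellipse_ellipse(0, last_ellipse);        /* ELLIPSE_* forms      */
    if (last_ellipse < inum) {
      run_sphere_ellipse(last_ellipse, inum);    /* SPHERE_ELLIPSE forms */
      run_lj(last_ellipse, inum);                /* sphere-sphere tail   */
    }
  } else {
    run_ellipse_ellipse(0, inum);                /* single-pass fallback */
  }
}
/* --------------------------- end sketch ------------------------------- */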
&this->gamma_upsilon_mu, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, &ainum, &this->ans->engv, &this->dev_error, diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index 1a7e69eeba..dc6e00ec82 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -17,93 +17,93 @@ #include "lal_ellipsoid_extra.h" #endif -ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, +ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, numtyp ans[9]) { numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]- m[2]*m[6]*m[4]+m[1]*m[6]*m[5]- m[3]*m[1]*m[8]+m[0]*m[4]*m[8]; den = ucl_recip(den); - + ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]- - m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ - m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- - m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ - m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; - + m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+ + m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]- + m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+ + m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den; + ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+ - (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- - (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- - m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ - m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; - + (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]- + (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]- + m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+ + m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den; + ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]- - m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- - m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ - (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ - m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; - + m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]- + m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+ + (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+ + m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den; + ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+ - m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ - m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- - m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- - m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; - + m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+ + m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]- + m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]- + m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den; + ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+ - (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- - (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ - m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- - m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; - + (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]- + (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+ + m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]- + m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den; + ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]- - m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ - (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ - m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- - (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; - + m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+ + (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+ + m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]- + (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den; + ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+ - (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ - m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- - m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- - m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; - + (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+ + m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]- + m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]- + m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den; + 
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]- - (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ - (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- - m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ - m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; - + (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+ + (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]- + m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+ + m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den; + ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]- - m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- - m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ - (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ - m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; + m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]- + m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+ + (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+ + m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den; } __kernel void k_gayberne(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, - const __global numtyp2 *restrict sig_eps, - const int ntypes, - const __global numtyp *restrict lshape, - const __global int *dev_nbor, - const int stride, - __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global numtyp *restrict lshape, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -124,7 +124,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; numtyp a1[9], b1[9], g1[9]; @@ -159,7 +159,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, numtyp a2[9]; gpu_quat_to_mat_trans(q,j,a2); - + numtyp u_r, dUr[3], tUr[3], eta, teta[3]; { // Compute U_r, dUr, eta, and teta // Compute g12 @@ -173,7 +173,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } { // Compute U_r and dUr - + // Compute kappa numtyp kappa[3]; gpu_mldivide3(g12,r12,kappa,err_flag); @@ -189,7 +189,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[2]*=ir; // energy - + // compute u_r and dUr numtyp uslj_rsq; { @@ -203,7 +203,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, kappa[0]*=r; kappa[1]*=r; kappa[2]*=r; - + int mtype=fast_mul(ntypes,itype)+jtype; numtyp sigma = sig_eps[mtype].x; numtyp epsilon = sig_eps[mtype].y; @@ -235,14 +235,14 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, } } } - + // Compute eta { eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; numtyp det_g12 = gpu_det3(g12); eta = ucl_powr(eta/det_g12,gum[1]); } - + // Compute teta numtyp temp[9], tempv[3], tempv2[3]; compute_eta_torque(g12,a1,ishape,temp); @@ -255,7 +255,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[0] = tempv2[0]; teta[1] = tempv2[1]; 
teta[2] = tempv2[2]; - + tempv[0] = temp1*temp[3]; tempv[1] = temp1*temp[4]; tempv[2] = temp1*temp[5]; @@ -272,7 +272,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, teta[1] += tempv2[1]; teta[2] += tempv2[2]; } - + numtyp chi, dchi[3], tchi[3]; { // Compute chi and dchi @@ -355,7 +355,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - + } // for nbor store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, vflag,ans,engv); diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index dacaf74282..8792f1f1db 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -25,14 +25,14 @@ template class GayBerne : public BaseEllipsoid { public: GayBerne(); - ~GayBerne(); + ~GayBerne(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init prob - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,18 +41,18 @@ class GayBerne : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const double gamma, const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, + double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -61,8 +61,8 @@ class GayBerne : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -72,12 +72,12 @@ class GayBerne : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; // 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ... 
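/* -----------------------------------------------------------------------
   Note on k_gayberne above: the three quantities accumulated per
   neighbor (u_r/dUr, eta/teta, chi/dchi) are the factors of the
   multiplicative Gay-Berne form.  In the code's notation (a hedged
   summary, in LaTeX):

     U_{12} = U_r(r_{12})\,\eta_{12}\,\chi_{12}, \qquad
     \eta_{12} = \left(\frac{2\,l_i\,l_j}{\det \mathbf{G}_{12}}\right)^{\upsilon}

   where lshape supplies l_i and l_j, gum[1] holds \upsilon, and the
   gpu_mldivide3() call solves \mathbf{G}_{12}\,\kappa = r_{12} for the
   \kappa vector feeding U_r and its derivative.
------------------------------------------------------------------------- */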
UCL_D_Vec gamma_upsilon_mu; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index e674fb376b..451550e7ef 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -33,7 +33,7 @@ int gb_gpu_init(const int ntypes, const double gamma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); @@ -58,16 +58,16 @@ int gb_gpu_init(const int ntypes, const double gamma, int init_ok=0; if (world_me==0) - init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -105,8 +105,8 @@ void gb_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -117,8 +117,8 @@ int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 9b33b5f7f3..7925b72784 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -18,30 +18,30 @@ #endif __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict gum, + const __global numtyp4 *restrict well, + const __global numtyp *restrict gum, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global numtyp *restrict lshape, - const __global int *dev_nbor, + const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int 
*restrict err_flag, + __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[3]; - sp_lj[1]=gum[4]; - sp_lj[2]=gum[5]; - sp_lj[3]=gum[6]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; acctyp energy=(acctyp)0; acctyp4 f; @@ -58,16 +58,16 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; - + numtyp oner=shape[itype].x; numtyp one_well=well[itype].x; - + numtyp factor_lj; for ( ; nbor0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -332,33 +332,33 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, - const int stride, +__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, + const int stride, const __global int *dev_ij, - __global acctyp4 *restrict ans, + __global acctyp4 *restrict ans, __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int start, const int inum, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid+3]; + sp_lj[tid]=gum[tid+3]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -367,9 +367,9 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 6c6e145319..978b33e5d7 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ LJT::LJ() : BaseAtomic(), _allocated(false) { } template -LJT::~LJ() { +LJT::~LJ() { clear(); } - + template int LJT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int LJT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, +int LJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double 
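/* -----------------------------------------------------------------------
   Note on the sp_lj staging visible in the kernels above: the fast
   variants let the first four threads copy special_lj from gum[3..6]
   into shared/__local memory, then barrier before the pair loop.  The
   same pattern in plain CUDA:
------------------------------------------------------------------------- */
__global__ void pair_kernel(const float *gum /*, ... */) {
  __shared__ float sp_lj[4];
  int tid = threadIdx.x;
  if (tid < 4)
    sp_lj[tid] = gum[tid + 3];   /* special_lj[0..3] sit at gum[3..6] */
  __syncthreads();               /* publish sp_lj to the whole block  */
  /* ... neighbor loop scales 1-2/1-3/1-4 pair terms by sp_lj[...] ... */
}
/* --------------------------- end sketch ------------------------------- */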
**host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -76,11 +76,11 @@ int LJT::init(const int ntypes, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq); + host_cutsq); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -99,10 +99,10 @@ void LJT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq); this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, @@ -143,7 +143,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,12 +155,12 @@ void LJT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 9569cb0fd7..5838ac95cf 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj(const __global numtyp4 *restrict x_, +__kernel void k_lj(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -44,19 +44,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += 
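/* -----------------------------------------------------------------------
   Note on the GX computation in LJ::loop() above: with BX threads per
   block and t_per_atom threads cooperating on each atom, a block covers
   BX/t_per_atom atoms, so the grid is the ceiling of inum over that.
   An integer version of the same sizing:
------------------------------------------------------------------------- */
static inline int grid_size(int inum, int BX, int t_per_atom) {
  int atoms_per_block = BX / t_per_atom;   /* t_per_atom divides BX */
  return (inum + atoms_per_block - 1) / atoms_per_block;
}
/* --------------------------- end sketch ------------------------------- */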
delx*delx*force; @@ -101,19 +101,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -124,7 +124,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -133,7 +133,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index 63a3e8a6c9..01ce85c8ea 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class LJ : public BaseAtomic { public: LJ(); - ~LJ(); + ~LJ(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJ : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJ : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 70e46b9fe1..191f211ae4 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -9,7 
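/* -----------------------------------------------------------------------
   Note on the k_lj inner loop above: force = r2inv*r6inv*(lj1.x*r6inv -
   lj1.y) and e = r6inv*(lj3.x*r6inv - lj3.y) are the 12-6 LJ terms with
   precomputed prefactors.  In the usual LAMMPS lj/cut convention (stated
   as background, not read from this diff): lj1 = 48*eps*sigma^12,
   lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6.
   A scalar reference:
------------------------------------------------------------------------- */
void lj_pair(double rsq, double lj1, double lj2, double lj3, double lj4,
             double *f_over_r, double *energy) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  *f_over_r = r2inv * r6inv * (lj1 * r6inv - lj2);  /* F/r */
  *energy   = r6inv * (lj3 * r6inv - lj4);          /* offset shift is
                                                       applied by caller */
}
/* --------------------------- end sketch ------------------------------- */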
+9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -36,7 +36,7 @@ template LJ96T::~LJ96() { clear(); } - + template int LJ96T::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,9 +44,9 @@ int LJ96T::bytes_per_atom(const int max_nbors) const { template int LJ96T::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -76,11 +76,11 @@ int LJ96T::init(const int ntypes, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq); + host_cutsq); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -126,7 +126,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,7 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index b219b8bf0d..3bb7750022 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; __kernel void k_lj96(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,15 +109,15 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj96_fast(const __global numtyp4 *restrict 
x_, +__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, + const __global numtyp4 *restrict lj3_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -132,30 +132,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index 7d51e287d3..3fdea5265e 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJ96 : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class LJ96 : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class LJ96 : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 14c32ef95e..5c4a58c5e8 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int** lj96_gpu_compute_n(const int ago, const int inum_full, return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, 
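/* -----------------------------------------------------------------------
   Note on the k_lj96 kernels above: they differ from 12-6 LJ only in the
   extra r3inv factor, i.e. a 9-6 potential E = 4*eps*((sigma/r)^9 -
   (sigma/r)^6).  The matching prefactors (36/24 in the force, 4/4 in the
   energy) are an assumption consistent with the kernel algebra, not read
   from this diff.  A scalar reference:
------------------------------------------------------------------------- */
#include <math.h>

void lj96_pair(double rsq, double eps, double sigma,
               double *f_over_r, double *energy) {
  double r2inv = 1.0 / rsq;
  double r6inv = r2inv * r2inv * r2inv;
  double r3inv = sqrt(r6inv);
  double lj1 = 36.0*eps*pow(sigma,9), lj2 = 24.0*eps*pow(sigma,6);
  double lj3 =  4.0*eps*pow(sigma,9), lj4 =  4.0*eps*pow(sigma,6);
  *f_over_r = r2inv * r6inv * (lj1 * r3inv - lj2);
  *energy   = r6inv * (lj3 * r3inv - lj4);
}
/* --------------------------- end sketch ------------------------------- */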
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp
index ef59843c4a..497e5989ad 100644
--- a/lib/gpu/lal_lj_class2_long.cpp
+++ b/lib/gpu/lal_lj_class2_long.cpp
@@ -38,7 +38,7 @@ template <class numtyp, class acctyp>
 LJClass2LongT::~LJClass2Long() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -46,8 +46,8 @@ int LJClass2LongT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJClass2LongT::init(const int ntypes, double **host_cutsq,
-                        double **host_lj1, double **host_lj2, double **host_lj3,
-                        double **host_lj4, double **host_offset,
+                        double **host_lj1, double **host_lj2, double **host_lj3,
+                        double **host_lj4, double **host_offset,
                         double *host_special_lj, const int nlocal,
                         const int nall, const int max_nbors,
                         const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -136,7 +136,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -145,11 +145,11 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
-                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index e16de3a327..41ceca35d7 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -32,15 +32,15 @@ texture q_tex;
 __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
                                const __global numtyp4 *restrict lj1,
                                const __global numtyp4 *restrict lj3,
-                               const int lj_types,
+                               const int lj_types,
                                const __global numtyp *restrict sp_lj_in,
-                               const __global int *dev_nbor,
+                               const __global int *dev_nbor,
                                const __global int *dev_packed,
                                __global acctyp4 *restrict ans,
                                __global acctyp *restrict engv,
-                               const int eflag, const int vflag,
-                               const int inum, const int nbor_pitch,
-                               const __global numtyp *restrict q_,
+                               const int eflag, const int vflag,
+                               const int inum, const int nbor_pitch,
+                               const __global numtyp *restrict q_,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -63,14 +63,14 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -147,20 +147,20 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp4 *restrict lj1_in,
-                                    const __global numtyp4 *restrict lj3_in,
+                                    const __global numtyp4 *restrict lj3_in,
                                     const __global numtyp *restrict sp_lj_in,
-                                    const __global int *dev_nbor,
+                                    const __global int *dev_nbor,
                                     const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
-                                    __global acctyp *restrict engv,
-                                    const int eflag, const int vflag,
-                                    const int inum, const int nbor_pitch,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv,
+                                    const int eflag, const int vflag,
+                                    const int inum, const int nbor_pitch,
                                     const __global numtyp *restrict q_,
-                                    const numtyp cut_coulsq,
+                                    const numtyp cut_coulsq,
                                     const numtyp qqrd2e,
-                                    const numtyp g_ewald,
+                                    const numtyp g_ewald,
                                     const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -175,7 +175,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -183,16 +183,16 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h
 class LJClass2Long : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,8 +40,8 @@ class LJClass2Long : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
@@ -68,7 +68,7 @@ class LJClass2Long : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp
index 4bb3aad7ad..6ed15126d9 100644
--- a/lib/gpu/lal_lj_class2_long_ext.cpp
+++ b/lib/gpu/lal_lj_class2_long_ext.cpp
@@ -82,7 +82,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
     C2CLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -99,7 +99,7 @@ void c2cl_gpu_clear() {
 
 int** c2cl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -109,8 +109,8 @@ int** c2cl_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void c2cl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp
index 8030f3cfc2..a8255318bd 100644
--- a/lib/gpu/lal_lj_coul.cpp
+++ b/lib/gpu/lal_lj_coul.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulT::~LJCoul() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulT::init(const int ntypes,
-                  double **host_cutsq, double **host_lj1,
-                  double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **host_offset,
+                  double **host_cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3,
+                  double **host_lj4, double **host_offset,
                   double *host_special_lj, const int nlocal,
                   const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
@@ -79,11 +79,11 @@ int LJCoulT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cut_coulsq);
+                         host_cut_ljsq, host_cut_coulsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@@ -138,7 +138,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -149,14 +149,14 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &cutsq, &_qqrd2e, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
                      &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
                      &cutsq, &_qqrd2e, &this->_threads_per_atom);
   }
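[Editor's sketch.] Every `*_fast` kernel in this patch follows one pattern: when the number of atom types is small enough that the `ntypes*ntypes` coefficient tables fit in `MAX_SHARED_TYPES`-sized shared memory, the block cooperatively stages `lj1`/`lj3`/`sp_lj` into `__local` storage once, synchronizes, and then indexes the cached table inside the neighbor loop. A minimal CUDA sketch of that staging pattern, with hypothetical names (`k_pair_fast_demo`, `MAX_SHARED_TYPES_DEMO`) rather than the actual Geryon kernels:

```cuda
#include <cuda_runtime.h>

#define MAX_SHARED_TYPES_DEMO 8   // assumed; real value is build-configured

// Assumes blockDim.x >= MAX_SHARED_TYPES_DEMO^2 so one pass loads the table.
__global__ void k_pair_fast_demo(const float4 *coeff_in, const int *pair_type,
                                 const float *rsq, float *force, int n) {
  __shared__ float4 coeff[MAX_SHARED_TYPES_DEMO * MAX_SHARED_TYPES_DEMO];
  int tid = threadIdx.x;
  // Cooperative load of the whole type-pair table, as in k_lj_coul_fast.
  if (tid < MAX_SHARED_TYPES_DEMO * MAX_SHARED_TYPES_DEMO)
    coeff[tid] = coeff_in[tid];
  __syncthreads();

  int i = blockIdx.x * blockDim.x + tid;
  if (i < n) {
    float4 c = coeff[pair_type[i]];          // shared-memory lookup per pair
    float r2inv = 1.0f / rsq[i];
    float r6inv = r2inv * r2inv * r2inv;
    force[i] = r6inv * (c.x * r6inv - c.y) * r2inv;  // 12-6 LJ force over r
  }
}
```

The `__syncthreads()` before the `if (ii<inum)` guard in the real kernels exists for the same reason as here: every thread must see the completed table before any thread reads it.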
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index 364203db22..5c7f0da46f 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : brownw@ornl.gov
 // ***************************************************************************/
@@ -29,19 +29,19 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict lj1,
-                        const __global numtyp4 *restrict lj3,
-                        const int lj_types,
+                        const __global numtyp4 *restrict lj3,
+                        const int lj_types,
                         const __global numtyp *restrict sp_lj_in,
-                        const __global int *dev_nbor,
-                        const __global int *dev_packed,
+                        const __global int *dev_nbor,
+                        const __global int *dev_packed,
                         __global acctyp4 *restrict ans,
-                        __global acctyp *restrict engv,
+                        __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
-                        const int nbor_pitch,
-                        const __global numtyp *restrict q_,
-                        const __global numtyp *restrict cutsq,
+                        const int nbor_pitch,
+                        const __global numtyp *restrict q_,
+                        const __global numtyp *restrict cutsq,
                         const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -63,14 +63,14 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -140,16 +140,16 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
 
 __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp4 *restrict lj1_in,
-                             const __global numtyp4 *restrict lj3_in,
+                             const __global numtyp4 *restrict lj3_in,
                              const __global numtyp *restrict sp_lj_in,
-                             const __global int *dev_nbor,
+                             const __global int *dev_nbor,
                              const __global int *dev_packed,
                              __global acctyp4 *restrict ans,
-                             __global acctyp *restrict engv,
-                             const int eflag, const int vflag, const int inum,
-                             const int nbor_pitch,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch,
                              const __global numtyp *restrict q_,
-                             const __global numtyp *restrict _cutsq,
+                             const __global numtyp *restrict _cutsq,
                              const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -166,7 +166,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -174,16 +174,16 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h
 class LJCoul : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,7 +40,7 @@ class LJCoul : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
@@ -70,7 +70,7 @@ class LJCoul : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e;
diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp
index 135a4dfd9d..92167f314f 100644
--- a/lib/gpu/lal_lj_coul_debye.cpp
+++ b/lib/gpu/lal_lj_coul_debye.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : nguyentd@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulDebyeT::~LJCoulDebye() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulDebyeT::init(const int ntypes,
-                       double **host_cutsq, double **host_lj1,
-                       double **host_lj2, double **host_lj3,
-                       double **host_lj4, double **host_offset,
+                       double **host_cutsq, double **host_lj1,
+                       double **host_lj2, double **host_lj3,
+                       double **host_lj4, double **host_offset,
                        double *host_special_lj, const int nlocal,
                        const int nall, const int max_nbors,
                        const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJCoulDebyeT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cut_coulsq);
+                         host_cut_ljsq, host_cut_coulsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@@ -98,7 +98,7 @@ int LJCoulDebyeT::init(const int ntypes,
 
   _qqrd2e=qqrd2e;
   _kappa=kappa;
-
+
   _allocated=true;
   this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
                    sp_lj.row_bytes();
@@ -140,7 +140,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -157,9 +157,9 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
-                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
+                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
                      &_qqrd2e, &_kappa, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu
index 308504c6c8..91b105b3da 100644
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -29,19 +29,19 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_debye(const __global numtyp4 *restrict x_,
+__kernel void k_lj_debye(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict lj1,
-                         const __global numtyp4 *restrict lj3,
-                         const int lj_types,
-                         const __global numtyp *restrict sp_lj_in,
-                         const __global int *dev_nbor,
-                         const __global int *dev_packed,
+                         const __global numtyp4 *restrict lj3,
+                         const int lj_types,
+                         const __global numtyp *restrict sp_lj_in,
+                         const __global int *dev_nbor,
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
                          const __global numtyp *restrict q_ ,
-                         const __global numtyp *restrict cutsq,
+                         const __global numtyp *restrict cutsq,
                          const numtyp qqrd2e, const numtyp kappa,
                          const int t_per_atom) {
   int tid, ii, offset;
@@ -64,14 +64,14 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -147,15 +147,15 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
 
 __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict lj1_in,
-                              const __global numtyp4 *restrict lj3_in,
+                              const __global numtyp4 *restrict lj3_in,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
                               __global acctyp4 *restrict ans,
-                              __global acctyp *restrict engv,
-                              const int eflag, const int vflag, const int inum,
-                              const int nbor_pitch,
-                              const __global numtyp *restrict q_,
+                              __global acctyp *restrict engv,
+                              const int eflag, const int vflag, const int inum,
+                              const int nbor_pitch,
+                              const __global numtyp *restrict q_,
                               const __global numtyp *restrict _cutsq,
                               const numtyp qqrd2e, const numtyp kappa,
                               const int t_per_atom) {
@@ -174,7 +174,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -182,16 +182,16 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h
 class LJCoulDebye : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,7 +40,7 @@ class LJCoulDebye : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            double **host_cut_coulsq, double *host_special_coul,
@@ -70,7 +70,7 @@ class LJCoulDebye : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e,_kappa;
diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp
index 67f5a0075f..3a0a3593e7 100644
--- a/lib/gpu/lal_lj_coul_debye_ext.cpp
+++ b/lib/gpu/lal_lj_coul_debye_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : nguyentd@ornl.gov
 ***************************************************************************/
@@ -33,7 +33,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double **host_cut_ljsq, double **host_cut_coulsq,
-                  double *host_special_coul, const double qqrd2e,
+                  double *host_special_coul, const double qqrd2e,
                   const double kappa) {
   LJCDMF.clear();
   gpu_mode=LJCDMF.device->gpu_mode();
@@ -82,7 +82,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, kappa);
 
     LJCDMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -99,7 +99,7 @@ void ljcd_gpu_clear() {
 
 int** ljcd_gpu_compute_n(const int ago,
                         const int inum_full, const int nall,
                         double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -109,8 +109,8 @@ int** ljcd_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljcd_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp
index 3b5cc09805..b803101b9e 100644
--- a/lib/gpu/lal_lj_coul_ext.cpp
+++ b/lib/gpu/lal_lj_coul_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -81,7 +81,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                     host_cut_coulsq, host_special_coul, qqrd2e);
 
     LJCMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -98,7 +98,7 @@ void ljc_gpu_clear() {
 
 int** ljc_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -108,8 +108,8 @@ int** ljc_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
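[Editor's sketch.] Every `init()` in this patch funnels its host tables through `atom->type_pack4`, which interleaves four `ntypes x ntypes` host matrices into one `numtyp4` array so the kernels fetch all coefficients for a type pair with a single load. A rough host-side illustration of that packing idea, under an assumed name (`pack4_demo`) rather than the real Geryon signature; LAMMPS type indices actually start at 1, which is ignored here for brevity:

```cuda
#include <vector>
#include <vector_functions.h>   // make_float4 (host and device)

// Interleave four per-type-pair tables into one float4-per-pair table.
std::vector<float4> pack4_demo(int ntypes, double **a, double **b,
                               double **c, double **d) {
  std::vector<float4> out(ntypes * ntypes);
  for (int i = 0; i < ntypes; ++i)
    for (int j = 0; j < ntypes; ++j)
      out[i * ntypes + j] = make_float4((float)a[i][j], (float)b[i][j],
                                        (float)c[i][j], (float)d[i][j]);
  return out;   // would then be copied into a read-only device buffer
}
```

The payoff is in the kernels above: `lj1[mtype]` yields `lj1`, `lj2`, and both cutoffs in one coalesced read instead of four scattered ones.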
diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp
index 03f32a5fd0..29d648bed2 100644
--- a/lib/gpu/lal_lj_coul_long.cpp
+++ b/lib/gpu/lal_lj_coul_long.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulLongT::~LJCoulLong() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulLongT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,9 +45,9 @@ int LJCoulLongT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulLongT::init(const int ntypes,
-                      double **host_cutsq, double **host_lj1,
-                      double **host_lj2, double **host_lj3,
-                      double **host_lj4, double **host_offset,
+                      double **host_cutsq, double **host_lj1,
+                      double **host_lj2, double **host_lj3,
+                      double **host_lj4, double **host_offset,
                       double *host_special_lj, const int nlocal,
                       const int nall, const int max_nbors,
                       const int maxspecial, const double cell_size,
@@ -80,11 +80,11 @@ int LJCoulLongT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -109,10 +109,10 @@ void LJCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj
   // Allocate a host write buffer for data initialization
   UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-
+
   for (int i=0; i<_lj_types*_lj_types; i++)
     host_write[i]=0.0;
-
+
   this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
                          host_cutsq, host_cut_ljsq);
   this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
@@ -153,7 +153,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -162,7 +162,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index e0aa2e8a58..0e25bb2dbc 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : brownw@ornl.gov
 // ***************************************************************************/
@@ -29,17 +29,17 @@ texture q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
                              const __global numtyp4 *restrict lj1,
-                             const __global numtyp4 *restrict lj3,
-                             const int lj_types,
-                             const __global numtyp *restrict sp_lj_in,
-                             const __global int *dev_nbor,
+                             const __global numtyp4 *restrict lj3,
+                             const int lj_types,
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor,
                              const __global int *dev_packed,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
-                             const int nbor_pitch,
+                             const int nbor_pitch,
                              const __global numtyp *restrict q_,
                              const numtyp cut_coulsq, const numtyp qqrd2e,
                              const numtyp g_ewald, const int t_per_atom) {
@@ -63,14 +63,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -145,14 +145,14 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp4 *restrict lj1_in,
-                                  const __global numtyp4 *restrict lj3_in,
+                                  const __global numtyp4 *restrict lj3_in,
                                   const __global numtyp *restrict sp_lj_in,
-                                  const __global int *dev_nbor,
+                                  const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
-                                  __global acctyp *restrict engv,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
                                   const __global numtyp *restrict q_,
@@ -171,7 +171,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -179,16 +179,16 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h
 class LJCoulLong : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,8 +40,8 @@ class LJCoulLong : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const double qqrd2e, const double g_ewald);
@@ -73,7 +73,7 @@ class LJCoulLong : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp
index dc93365f22..6f8b5c9fe1 100644
--- a/lib/gpu/lal_lj_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_coul_long_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -82,7 +82,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                      host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
 
     LJCLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,15 +102,15 @@ void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
   int world_me=LJCLMF.device->world_me();
   int gpu_rank=LJCLMF.device->gpu_rank();
   int procs_per_gpu=LJCLMF.device->procs_per_gpu();
-
+
   if (world_me==0)
-    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+    LJCLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                   offset, host_cut_ljsq);
 
   LJCLMF.device->world_barrier();
-
+
   for (int i=0; i<procs_per_gpu; i++) {
     LJCLMF.device->gpu_barrier();
   }
@@ -122,7 +122,7 @@ void ljcl_gpu_clear() {
 
 int** ljcl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -132,8 +132,8 @@ int** ljcl_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp
index dd045b7970..1358de9ee1 100644
--- a/lib/gpu/lal_lj_coul_msm.cpp
+++ b/lib/gpu/lal_lj_coul_msm.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
 LJCoulMSMT::~LJCoulMSM() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
@@ -45,8 +45,8 @@ int LJCoulMSMT::bytes_per_atom(const int max_nbors) const {
 
 template <class numtyp, class acctyp>
 int LJCoulMSMT::init(const int ntypes,
-                     double **host_cutsq, double **host_lj1,
-                     double **host_lj2, double **host_lj3,
+                     double **host_cutsq, double **host_lj1,
+                     double **host_lj2, double **host_lj3,
                      double **host_lj4, double **host_gcons,
                      double **host_dgcons, double **host_offset,
                      double *host_special_lj, const int nlocal,
@@ -81,11 +81,11 @@ int LJCoulMSMT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   // pack gcons and dgcons
   int nrows, ncols;
@@ -93,11 +93,11 @@ int LJCoulMSMT::init(const int ntypes,
   ncols = 7;
   UCL_H_Vec<numtyp> dview_gcons(nrows*ncols,*(this->ucl_device),
                                 UCL_WRITE_ONLY);
-
+
   for (int ix=0; ix<nrows; ix++)
   gcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(gcons,dview_gcons,false);
   gcons_tex.get_texture(*(this->pair_program),"gcons_tex");
@@ -107,11 +107,11 @@ int LJCoulMSMT::init(const int ntypes,
   ncols = 6;
   UCL_H_Vec<numtyp> dview_dgcons(nrows*ncols,*(this->ucl_device),
                                  UCL_WRITE_ONLY);
-
+
   for (int ix=0; ix<nrows; ix++)
   dgcons.alloc(nrows*ncols,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(dgcons,dview_dgcons,false);
   dgcons_tex.get_texture(*(this->pair_program),"dgcons_tex");
@@ -170,7 +170,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -179,7 +179,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) {
   this->time_pair.start();
   if (shared_types) {
     this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu
index 0c7c3cdace..3f73c6f47d 100644
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : nguyentd@ornl.gov
 // ***************************************************************************/
@@ -80,19 +80,19 @@ ucl_inline numtyp dgamma(const numtyp rho, const int order,
     return ((numtyp)-1.0/rho/rho);
 }
 
-__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const __global numtyp4 *restrict lj3,
                             const __global numtyp *restrict gcons,
                             const __global numtyp *restrict dgcons,
                             const int lj_types,
-                            const __global numtyp *restrict sp_lj_in,
-                            const __global int *dev_nbor,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
-                            const int nbor_pitch,
+                            const int nbor_pitch,
                             const __global numtyp *restrict q_,
                             const numtyp cut_coulsq, const numtyp qqrd2e,
                             const int order, const int t_per_atom) {
@@ -116,20 +116,20 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (vflag>0) {
       virial[0] += delx*delx*force;
@@ -199,7 +199,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp4 *restrict lj1_in,
                                  const __global numtyp4 *restrict lj3_in,
                                  const __global numtyp *restrict gcons,
@@ -227,7 +227,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -235,16 +235,16 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_msm.h b/lib/gpu/lal_lj_coul_msm.h
 class LJCoulMSM : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -41,8 +41,8 @@ class LJCoulMSM : public BaseCharge {
   int init(const int ntypes, double **host_cutsq,
            double **host_lj1, double **host_lj2, double **host_lj3,
            double **host_lj4, double **host_gcons, double **host_dgcons,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
            const int order, const double qqrd2e);
@@ -65,14 +65,14 @@ class LJCoulMSM : public BaseCharge {
   UCL_D_Vec lj3;
   /// Special LJ values [0-3] and Special Coul values [4-7]
   UCL_D_Vec sp_lj;
-
+
   UCL_D_Vec gcons, dgcons;
   UCL_Texture gcons_tex, dgcons_tex;
-
+
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _cut_coulsq, _qqrd2e;
diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp
index ecf3254cf9..bf520e4dc5 100644
--- a/lib/gpu/lal_lj_coul_msm_ext.cpp
+++ b/lib/gpu/lal_lj_coul_msm_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : brownw@ornl.gov
 ***************************************************************************/
@@ -84,7 +84,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                       host_cut_coulsq, host_special_coul, order, qqrd2e);
 
     LJCMLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -101,7 +101,7 @@ void ljcm_gpu_clear() {
 
 int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum, const double cpu_time,
@@ -111,8 +111,8 @@ int** ljcm_gpu_compute_n(const int ago, const int inum_full,
                         subhi, tag, nspecial, special, eflag, vflag, eatom,
                         vatom, host_start, ilist, jnum, cpu_time, success,
                         host_q, boxlo, prd);
-}
-
+}
+
 void ljcm_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp
index 25f83166e1..21ea22845c 100644
--- a/lib/gpu/lal_lj_cubic.cpp
+++ b/lib/gpu/lal_lj_cubic.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -33,21 +33,21 @@ LJCubicT::LJCubic() : BaseAtomic(), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
-LJCubicT::~LJCubic() {
+LJCubicT::~LJCubic() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJCubicT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-int LJCubicT::init(const int ntypes,
+int LJCubicT::init(const int ntypes,
                    double **host_cutsq, double **host_cut_inner_sq,
-                   double **host_cut_inner, double **host_sigma,
-                   double **host_epsilon, double **host_lj1,
-                   double **host_lj2, double **host_lj3, double **host_lj4,
+                   double **host_cut_inner, double **host_sigma,
+                   double **host_epsilon, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4,
                    double *host_special_lj, const int nlocal,
                    const int nall, const int max_nbors,
                    const int maxspecial, const double cell_size,
@@ -77,11 +77,11 @@ int LJCubicT::init(const int ntypes,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cutsq);
+                         host_cutsq);
 
   lj2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj2,host_write,host_cut_inner_sq,
-                         host_cut_inner,host_sigma,host_epsilon);
+                         host_cut_inner,host_sigma,host_epsilon);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack2(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4);
@@ -132,7 +132,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -144,12 +144,12 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag,
-                          &vflag, &ainum, &nbor_pitch,
+                          &vflag, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+    this->k_pair.run(&this->atom->x, &lj1, &lj2, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                      &this->ans->force, &this->ans->engv, &eflag, &vflag,
                      &ainum,
                     &nbor_pitch, &this->_threads_per_atom);
   }
diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu
index 420689383f..a4b1992f33 100644
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@@ -9,7 +9,7 @@
 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-// begin                :
+// begin                :
 // email                : ndactrung@gmail.com
 // ***************************************************************************/
@@ -31,16 +31,16 @@ texture pos_tex;
 #define _DPHIDS (numtyp)2.6899009 // gradient at s
 #define _A3 (numtyp)27.93357      // cubic coefficient
 
-__kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
+__kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict lj1,
                          const __global numtyp4 *restrict lj2,
-                         const __global numtyp2 *restrict lj3,
-                         const int lj_types,
-                         const __global numtyp *restrict sp_lj,
-                         const __global int * dev_nbor,
-                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv,
+                         const __global numtyp2 *restrict lj3,
+                         const int lj_types,
+                         const __global numtyp *restrict sp_lj,
+                         const __global int * dev_nbor,
+                         const __global int * dev_packed,
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -52,19 +52,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
       if (eflag>0) {
         numtyp e;
-        if (rsq <= lj2[mtype].x)
+        if (rsq <= lj2[mtype].x)
          e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
         else
          e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0);
-        energy+=factor_lj*e;
+        energy+=factor_lj*e;
       }
       if (vflag>0) {
         virial[0] += delx*delx*force;
@@ -122,20 +122,20 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict lj1_in,
                               const __global numtyp4 *restrict lj2_in,
-                              const __global numtyp2 *restrict lj3_in,
-                              const __global numtyp *restrict sp_lj_in,
-                              const __global int * dev_nbor,
-                              const __global int * dev_packed,
-                              __global acctyp4 *restrict ans,
-                              __global acctyp *restrict engv,
-                              const int eflag, const int vflag, const int inum,
+                              const __global numtyp2 *restrict lj3_in,
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int * dev_nbor,
+                              const __global int * dev_packed,
+                              __global acctyp4 *restrict ans,
+                              __global acctyp *restrict engv,
+                              const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
-
+
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -148,7 +148,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -157,7 +157,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   __syncthreads();
-
+
   if (ii<inum) {
       if (eflag>0) {
         numtyp e;
-        if (rsq <= lj2[mtype].x)
+        if (rsq <= lj2[mtype].x)
          e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
         else
          e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0);
-        energy+=factor_lj*e;
+        energy+=factor_lj*e;
       }
       if (vflag>0) {
         virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h
index 0fefc727eb..818fb3581b 100644
--- a/lib/gpu/lal_lj_cubic.h
+++ b/lib/gpu/lal_lj_cubic.h
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
 class LJCubic : public BaseAtomic {
  public:
   LJCubic();
-  ~LJCubic();
+  ~LJCubic();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -39,11 +39,11 @@ class LJCubic : public BaseAtomic {
     * - -5 Double precision is not supported on card **/
   int init(const int ntypes, double **host_cutsq, double **host_cut_inner_sq,
            double **host_cut_inner, double **host_sigma, double **host_epsilon,
-           double **host_lj1, double **host_lj2, double **host_lj3,
-           double **host_lj4, double *host_special_lj, const int nlocal,
-           const int nall, const int max_nbors, const int maxspecial,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double *host_special_lj, const int nlocal,
+           const int nall, const int max_nbors, const int maxspecial,
            const double cell_size, const double gpu_split, FILE *screen);
-
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -68,7 +68,7 @@ class LJCubic : public BaseAtomic {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
  private:
diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp
index 518f706781..efbcee0a9f 100644
--- a/lib/gpu/lal_lj_cubic_ext.cpp
+++ b/lib/gpu/lal_lj_cubic_ext.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -27,11 +27,11 @@ static LJCubic LJCubicLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
+int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
                   double **cut_inner, double **sigma, double **epsilon,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double *special_lj,
-                  const int inum, const int nall, const int max_nbors,
+                  double **host_lj1, double **host_lj2, double **host_lj3,
+                  double **host_lj4, double *special_lj,
+                  const int inum, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   int &gpu_mode, FILE *screen) {
   LJCubicLMF.clear();
@@ -81,7 +81,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq,
                          cell_size, gpu_split, screen);
 
     LJCubicLMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -106,8 +106,8 @@ int ** ljcb_gpu_compute_n(const int ago, const int inum_full,
   return LJCubicLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                             subhi, tag, nspecial, special, eflag, vflag,
                             eatom, vatom, host_start, ilist, jnum, cpu_time,
                             success);
-}
-
+}
+
 void ljcb_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp
index 1b8fdeabb0..1efac3e821 100644
--- a/lib/gpu/lal_lj_dsf.cpp
+++ b/lib/gpu/lal_lj_dsf.cpp
@@ -37,22 +37,22 @@ template <class numtyp, class acctyp>
 LJDSFT::~LJDSF() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJDSFT::bytes_per_atom(const int max_nbors) const {
   return this->bytes_per_atom_atomic(max_nbors);
 }
 
 template <class numtyp, class acctyp>
-int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
+int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
                  double **host_lj2, double **host_lj3, double **host_lj4,
-                 double **host_offset, double *host_special_lj,
+                 double **host_offset, double *host_special_lj,
                  const int nlocal, const int nall, const int max_nbors,
-                 const int maxspecial, const double cell_size,
+                 const int maxspecial, const double cell_size,
                  const double gpu_split, FILE *_screen,
                  double **host_cut_ljsq, const double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
-                 const double e_shift, const double f_shift,
+                 const double e_shift, const double f_shift,
                  const double alpha) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@@ -84,11 +84,11 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
   lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-                         host_cut_ljsq, host_cutsq);
+                         host_cut_ljsq, host_cutsq);
 
   lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
   this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-                         host_offset);
+                         host_offset);
 
   sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
   for (int i=0; i<4; i++) {
@@ -138,7 +138,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) {
     vflag=1;
   else
     vflag=0;
-
+
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
@@ -149,15 +149,15 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) {
     this->k_pair_fast.set_size(GX,BX);
     this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
                           &vflag, &ainum, &nbor_pitch, &this->atom->q,
                           &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift,
                           &_alpha, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
                      &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
                      &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
                      &this->_threads_per_atom);
diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu
index 5e0cd4aca9..323576fe77 100644
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@@ -31,20 +31,20 @@ texture q_tex;
 
 #define MY_PIS (acctyp)1.77245385090551602729
 
-__kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
+__kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict lj1,
-                       const __global numtyp4 *restrict lj3,
-                       const int lj_types,
-                       const __global numtyp *restrict sp_lj_in,
-                       const __global int *dev_nbor,
-                       const __global int *dev_packed,
+                       const __global numtyp4 *restrict lj3,
+                       const int lj_types,
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor,
+                       const __global int *dev_packed,
                        __global acctyp4 *restrict ans,
-                       __global acctyp *restrict engv,
+                       __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
-                       const int nbor_pitch,
+                       const int nbor_pitch,
                        const __global numtyp *restrict q_ ,
                        const numtyp cut_coulsq, const numtyp qqrd2e,
-                       const numtyp e_shift, const numtyp f_shift,
+                       const numtyp e_shift, const numtyp f_shift,
                        const numtyp alpha, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -66,20 +66,20 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   if (ii<inum) {
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
                       qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
@@ -119,7 +119,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                     rsq*f_shift-factor_coul);
       } else forcecoul = (numtyp)0.0;
@@ -156,19 +156,19 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1_in,
                             const __global numtyp4 *restrict lj3_in,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             __global acctyp4 *restrict ans,
-                            __global acctyp *restrict engv,
-                            const int eflag, const int vflag, const int inum,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
                             const __global numtyp *restrict q_,
                             const numtyp cut_coulsq, const numtyp qqrd2e,
-                            const numtyp e_shift, const numtyp f_shift,
+                            const numtyp e_shift, const numtyp f_shift,
                             const numtyp alpha, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -183,7 +183,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
     if (eflag>0)
       lj3[tid]=lj3_in[tid];
   }
-
+
   acctyp energy=(acctyp)0;
   acctyp e_coul=(acctyp)0;
   acctyp4 f;
@@ -191,23 +191,23 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
-
+
   __syncthreads();
-
+
   if (ii<inum) {
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
                       qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
@@ -246,7 +246,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                     rsq*f_shift-factor_coul);
       } else forcecoul = (numtyp)0.0;
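[Editor's sketch.] Both `k_lj_dsf` kernels evaluate the complementary error function inline through the Abramowitz-Stegun 7.1.26 rational polynomial (the `EWALD_P` and `A1`-`A5` terms in the hunks above) rather than calling a library `erfc()` in the inner loop. A standalone single-precision sketch of that evaluation, under the assumed demo name `erfc_as_demo`; the constants are the standard A&S values and are restated here as an assumption:

```cuda
// Polynomial approximation of erfc(alpha*r), Abramowitz & Stegun eq. 7.1.26.
__device__ float erfc_as_demo(float alpha, float r) {
  const float EWALD_P = 0.3275911f;
  const float A1 =  0.254829592f, A2 = -0.284496736f, A3 = 1.421413741f;
  const float A4 = -1.453152027f, A5 =  1.061405429f;
  float erfcd = expf(-alpha * alpha * r * r);        // exp(-(alpha*r)^2)
  float t = 1.0f / (1.0f + EWALD_P * alpha * r);     // Horner variable
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * erfcd;
}
```

Keeping `erfcd` as a separate factor matches the kernels above, which reuse the same exponential in the `forcecoul` expression.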
diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h
index 5badf543c4..0195898ca4 100644
--- a/lib/gpu/lal_lj_dsf.h
+++ b/lib/gpu/lal_lj_dsf.h
@@ -30,7 +30,7 @@ class LJDSF : public BaseCharge {
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    *
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
@@ -40,11 +40,11 @@ class LJDSF : public BaseCharge {
   int init(const int ntypes, double **host_cutsq, double **host_lj1,
            double **host_lj2, double **host_lj3, double **host_lj4,
            double **host_offset, double *host_special_lj,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen, double **host_cut_ljsq,
            const double host_cut_coulsq, double *host_special_coul,
-           const double qqrd2e, const double e_shift, const double f_shift,
+           const double qqrd2e, const double e_shift, const double f_shift,
            const double alpha);
 
   /// Clear all host and device data
@@ -69,7 +69,7 @@ class LJDSF : public BaseCharge {
   /// If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
 
-  /// Number of atom types
+  /// Number of atom types
   int _lj_types;
 
   numtyp _qqrd2e;
diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp
index 719a792d7f..25802e7544 100644
--- a/lib/gpu/lal_lj_dsf_ext.cpp
+++ b/lib/gpu/lal_lj_dsf_ext.cpp
@@ -34,7 +34,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  double **host_cut_ljsq, const double host_cut_coulsq,
                  double *host_special_coul, const double qqrd2e,
-                 const double e_shift, const double f_shift,
+                 const double e_shift, const double f_shift,
                  const double alpha) {
   LJDMF.clear();
   gpu_mode=LJDMF.device->gpu_mode();
@@ -85,7 +85,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                     f_shift, alpha);
 
     LJDMF.device->gpu_barrier();
-    if (message)
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -102,7 +102,7 @@ void ljd_gpu_clear() {
 
 int** ljd_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
-                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
                         const bool eatom, const bool vatom, int &host_start,
                         int **ilist, int **jnum, const double cpu_time,
@@ -112,8 +112,8 @@ int** ljd_gpu_compute_n(const int ago, const int inum_full,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success,
                        host_q, boxlo, prd);
-}
-
+}
+
 void ljd_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp
index 03526bc095..34a4d71c0b 100644
--- a/lib/gpu/lal_lj_expand.cpp
+++ b/lib/gpu/lal_lj_expand.cpp
@@ -9,7 +9,7 @@
     This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                :
+    begin                :
     email                : ibains@nvidia.com
 ***************************************************************************/
@@ -36,7 +36,7 @@ template <class numtyp, class acctyp>
 LJExpandT::~LJExpand() {
   clear();
 }
-
+
 template <class numtyp, class acctyp>
 int LJExpandT::bytes_per_atom(const int max_nbors) const {
this->bytes_per_atom_atomic(max_nbors); @@ -76,11 +76,11 @@ int LJExpandT::init(const int ntypes, double **host_cutsq, lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq, host_shift); + host_cutsq, host_shift); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -97,17 +97,17 @@ void LJExpandT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, host_cutsq, host_shift); - + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, host_offset); } @@ -146,7 +146,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -155,15 +155,15 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 6b79db2323..9281ad27bd 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ibains@nvidia.com // ***************************************************************************/ @@ -26,15 +26,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_lj_expand(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -52,20 +52,20 @@ __kernel void k_lj_expand(const 
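Each loop() method above sizes the launch so that _threads_per_atom threads cooperate on one atom: a block of BX threads covers BX/t_per_atom atoms, and GX such blocks cover all inum atoms. A minimal sketch of that computation (the function name is illustrative):

#include <cmath>

// Grid size used by the loop() methods: BX threads per block,
// t_per_atom cooperating threads per atom, so each block processes
// BX/t_per_atom atoms (integer division, as in the original).
int pair_grid_size(int inum, int BX, int t_per_atom) {
  return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                    (BX/t_per_atom)));
}

With BX=128, t_per_atom=4, and inum=1000, this yields GX=32 blocks of 32 atoms each.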
__global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -113,15 +113,15 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, +__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -136,30 +136,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, if (eflag>0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(numtyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 0d0ae0b2e6..a732a3a686 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ class LJExpand : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,15 +40,15 @@ class LJExpand : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); - + /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, double **host_shift); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -71,7 +71,7 @@ class LJExpand : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 
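k_lj_expand and k_lj_expand_fast above evaluate the 12-6 potential at the shifted separation r-delta (pair_style lj/expand). A standalone sketch of that pairwise form, written against eps/sigma/delta rather than the packed lj1/lj3 vectors used on the device (the helper name is illustrative):

#include <cmath>

// pair lj/expand: LJ 12-6 evaluated at the shifted distance rs = r - delta.
// Writes the energy to *e and the force magnitude along r to *fpair
// (F = -dE/dr; r and rs differ only by a constant).
void lj_expand_pair(double r, double eps, double sigma, double delta,
                    double *fpair, double *e) {
  const double rs = r - delta;
  const double sr6 = std::pow(sigma/rs, 6);
  const double sr12 = sr6*sr6;
  *e = 4.0*eps*(sr12 - sr6);
  *fpair = 24.0*eps*(2.0*sr12 - sr6)/rs;
}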
5303149d1f..94a57192b9 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ibains@nvidia.com ***************************************************************************/ @@ -30,7 +30,7 @@ static LJExpand LJEMF; int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); @@ -78,7 +78,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -98,12 +98,12 @@ int lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); - + if (world_me==0) LJEMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift); LJEMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -97,11 +97,11 @@ void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); - + if (world_me==0) LJLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset); LJLMF.device->world_barrier(); - + for (int i=0; i LJGROMACST::~LJGROMACS() { clear(); } - + template int LJGROMACST::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -47,11 +47,11 @@ template int LJGROMACST::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -134,7 +134,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,16 +146,16 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - 
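Every *_gpu_init()/*_gpu_reinit() wrapper in this patch follows the bring-up protocol visible in lje_gpu_init above: world rank 0 initializes first (so JIT compilation and the "Done." message happen once), then the ranks sharing each GPU take their turn between barriers. A condensed sketch of that pattern with the device queries abstracted as parameters (all names here are illustrative):

#include <functional>

// Staged initialization: rank 0 of the world first, then one rank per
// GPU at a time, with barriers serializing each stage.
int staged_init(int world_me, int gpu_rank, int procs_per_gpu,
                const std::function<int()> &do_init,
                const std::function<void()> &world_barrier,
                const std::function<void()> &gpu_barrier) {
  int init_ok = 0;
  if (world_me == 0) init_ok = do_init();
  world_barrier();                      // all ranks wait for rank 0
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0) init_ok = do_init();
    gpu_barrier();                      // serialize ranks sharing one GPU
  }
  return init_ok;
}

This keeps per-GPU allocation and kernel compilation from racing when several MPI ranks share a device.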
&eflag, &vflag, &ainum, &nbor_pitch, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index f20d8634a5..93dc3d9456 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -35,8 +35,8 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -59,7 +59,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; @@ -83,7 +83,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, if (rsq lj1[mtype].w) { @@ -91,7 +91,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; @@ -149,22 +149,22 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, lj3[tid]=lj3_in[tid]; ljsw[tid]=ljsw_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii lj1[mtype].w) { @@ -196,7 +196,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, t = r - lj3[mtype].z; numtyp fswitch = r*t*t*(ljsw[mtype].x + ljsw[mtype].y*t); force_lj += fswitch; - } + } force = factor_lj*force_lj * r2inv; diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index dc949be4a9..1e0f72dafc 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ class LJGROMACS : public BaseAtomic { /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,11 +40,11 @@ class LJGROMACS : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double 
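The force-switch branch visible in k_lj_gromacs above adds a cubic correction beyond the inner cutoff so the force falls smoothly to zero at the outer cutoff; lj3[mtype].z holds cut_inner and ljsw[mtype].x/.y the switch coefficients. A scalar sketch of just that branch (swA/swB stand in for the packed coefficients):

// GROMACS-style force switching: below cut_inner the plain LJ force is
// used; beyond it, r*t*t*(swA + swB*t) with t = r - cut_inner is added.
double switched_force(double force_lj, double r, double cut_inner,
                      double swA, double swB) {
  if (r > cut_inner) {
    const double t = r - cut_inner;
    force_lj += r*t*t*(swA + swB*t);
  }
  return force_lj;
}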
**host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq); /// Clear all host and device data @@ -71,7 +71,7 @@ class LJGROMACS : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index b5eb0038b7..53b93bfdff 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,7 +33,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, + double **host_ljsw4, double **host_ljsw5, double **cut_inner, double **cut_inner_sq) { LJGRMMF.clear(); gpu_mode=LJGRMMF.device->gpu_mode(); @@ -59,7 +59,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->world_barrier(); @@ -78,11 +78,11 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, - gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, + gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); LJGRMMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -107,8 +107,8 @@ int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, return LJGRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -118,7 +118,7 @@ void ljgrm_gpu_compute(const int ago, const int inum_full, const int nall, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } - + double ljgrm_gpu_bytes() { return LJGRMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp index 2ab7cb8d14..1510275047 100644 --- a/lib/gpu/lal_mie.cpp +++ b/lib/gpu/lal_mie.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,17 +33,17 @@ MieT::Mie() : BaseAtomic(), _allocated(false) { } template -MieT::~Mie() { +MieT::~Mie() { clear(); } - + template int MieT::bytes_per_atom(const int max_nbors) const { return 
this->bytes_per_atom_atomic(max_nbors); } template -int MieT::init(const int ntypes, double **host_cutsq, +int MieT::init(const int ntypes, double **host_cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, @@ -76,12 +76,12 @@ int MieT::init(const int ntypes, double **host_cutsq, mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2, - host_gamA,host_gamR); + host_gamA,host_gamR); mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4, - host_offset,host_cutsq); - + host_offset,host_cutsq); + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -126,7 +126,7 @@ void MieT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index 4d718897eb..33018566eb 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,15 +24,15 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_mie(const __global numtyp4 *restrict x_, +__kernel void k_mie(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mie1, const __global numtyp4 *restrict mie3, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -50,20 +50,20 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii class Mie : public BaseAtomic { public: Mie(); - ~Mie(); + ~Mie(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -41,8 +41,8 @@ class Mie : public BaseAtomic { double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -67,7 +67,7 @@ class Mie : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index d7c4187a42..9b03903c4f 100644 --- 
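The Mie kernels above generalize LJ to arbitrary repulsive/attractive exponents (host_gamR/host_gamA, packed into mie1 on the host). For reference, the textbook form of the potential the kernels implement; this sketch works from the raw parameters rather than the device-side packed coefficients:

#include <cmath>

// Mie potential: E(r) = C*eps*[(sigma/r)^gamR - (sigma/r)^gamA], with the
// prefactor C chosen so the well depth equals eps. gamR=12, gamA=6
// recovers C=4 and the standard LJ 12-6 form.
double mie_energy(double r, double eps, double sigma,
                  double gamR, double gamA) {
  const double C = (gamR/(gamR - gamA)) *
                   std::pow(gamR/gamA, gamA/(gamR - gamA));
  return C*eps*(std::pow(sigma/r, gamR) - std::pow(sigma/r, gamA));
}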
a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -81,7 +81,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -106,8 +106,8 @@ int ** mie_gpu_compute_n(const int ago, const int inum_full, return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp index ddf7d843e6..cbdf928863 100644 --- a/lib/gpu/lal_morse.cpp +++ b/lib/gpu/lal_morse.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -33,20 +33,20 @@ MorseT::Morse() : BaseAtomic(), _allocated(false) { } template -MorseT::~Morse() { +MorseT::~Morse() { clear(); } - + template int MorseT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int MorseT::init(const int ntypes, - double **host_cutsq, double **host_morse1, - double **host_r0, double **host_alpha, - double **host_d0, double **host_offset, +int MorseT::init(const int ntypes, + double **host_cutsq, double **host_morse1, + double **host_r0, double **host_alpha, + double **host_d0, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -125,7 +125,7 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -135,14 +135,14 @@ void MorseT::loop(const bool _eflag, const bool _vflag) { if (shared_types) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &mor1, &mor2, &_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu index 2015c71cb2..0a14071d19 100644 --- a/lib/gpu/lal_morse.cu +++ b/lib/gpu/lal_morse.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // 
***************************************************************************/ @@ -26,13 +26,13 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_morse(const __global numtyp4 *restrict x_, +__kernel void k_morse(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1, - const __global numtyp2 *restrict mor2, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp2 *restrict mor2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, @@ -59,13 +59,13 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; numtyp factor_lj; for ( ; nbor0) { numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,15 +111,15 @@ __kernel void k_morse(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_morse_fast(const __global numtyp4 *restrict x_, +__kernel void k_morse_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict mor1_in, - const __global numtyp2 *restrict mor2_in, + const __global numtyp2 *restrict mor2_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -134,30 +134,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_, if (eflag>0) mor2[tid]=mor2_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y; - energy+=e*factor_lj; + energy+=e*factor_lj; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h index e64852f315..ef80fb4235 100644 --- a/lib/gpu/lal_morse.h +++ b/lib/gpu/lal_morse.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Morse : public BaseAtomic { public: Morse(); - ~Morse(); + ~Morse(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -40,8 +40,8 @@ class Morse : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_morse1, double **host_r0, double **host_alpha, double **host_d0, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double 
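k_morse and k_morse_fast above work from dexp = exp(-alpha*(r-r0)); the energy line e = mor2.x*(dexp*dexp - 2.0*dexp) - mor2.y is visible in the hunk. A scalar sketch of the pair term in terms of the raw Morse parameters (packing d0/alpha into mor1/mor2 is the host's job; this helper is illustrative):

#include <cmath>

// Morse pair term with dexp = exp(-alpha*(r - r0)):
//   E = d0*(dexp^2 - 2*dexp) - offset
//   F = -dE/dr = 2*d0*alpha*(dexp^2 - dexp)
void morse_pair(double r, double d0, double alpha, double r0, double offset,
                double *fpair, double *e) {
  const double dexp = std::exp(-alpha*(r - r0));
  *e = d0*(dexp*dexp - 2.0*dexp) - offset;
  *fpair = 2.0*d0*alpha*(dexp*dexp - dexp);
}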
cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -66,7 +66,7 @@ class Morse : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _types; private: diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 3994473fd3..0338bc07a8 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Morse MORMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, + const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); @@ -55,7 +55,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); @@ -78,7 +78,7 @@ int mor_gpu_init(const int ntypes, double **cutsq, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -103,8 +103,8 @@ int** mor_gpu_compute_n(const int ago, const int inum_full, return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 074eaa842b..0a9933a6c0 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov, penwang@nvidia.com ***************************************************************************/ @@ -32,13 +32,13 @@ int Neighbor::bytes_per_atom(const int max_nbors) const { } bool Neighbor::init(NeighborShared *shared, const int inum, - const int host_inum, const int max_nbors, - const int maxspecial, UCL_Device &devi, - const int gpu_nbor, const int gpu_host, + const int host_inum, const int max_nbors, + const int maxspecial, UCL_Device &devi, + const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, const int warp_size, - const bool time_device, + const bool time_device, const std::string compile_flags) { clear(); @@ -56,10 +56,10 @@ bool 
Neighbor::init(NeighborShared *shared, const int inum, _gpu_host=false; else if (gpu_host==1) _gpu_host=true; - else + else // Not yet implemented assert(0==1); - + if (pre_cut || gpu_nbor==0) _alloc_packed=true; else @@ -71,7 +71,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _packed_permissions=UCL_READ_ONLY; bool success=true; - + // Initialize timers for the selected GPU _nbor_time_avail=false; time_nbor.init(*dev); @@ -88,7 +88,7 @@ bool Neighbor::init(NeighborShared *shared, const int inum, _max_atoms=static_cast(static_cast(inum)*1.10); if (_max_atoms==0) _max_atoms=1000; - + _max_host=static_cast(static_cast(host_inum)*1.10); _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom; @@ -102,21 +102,21 @@ bool Neighbor::init(NeighborShared *shared, const int inum, alloc(success); if (!success) return false; - + if (_use_packing==false) _shared->compile_kernels(devi,gpu_nbor,compile_flags); return success; } -void Neighbor::alloc(bool &success) { +void Neighbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); int nt=_max_atoms+_max_host; - if (_use_packing==false || _gpu_nbor>0) - success=success && + if (_use_packing==false || _gpu_nbor>0) + success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS); - else + else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); success=success && (host_acc.alloc(nt*2,*dev, @@ -127,14 +127,17 @@ void Neighbor::alloc(bool &success) { dev_packed.clear(); success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev, _packed_permissions)==UCL_SUCCESS); - _c_bytes+=dev_packed.row_bytes(); - } + dev_acc.clear(); + success=success && (dev_acc.alloc(_max_atoms,*dev, + UCL_READ_WRITE)==UCL_SUCCESS); + _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes(); + } if (_max_host>0) { nbor_host.clear(); dev_numj_host.clear(); host_ilist.clear(); host_jlist.clear(); - + success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE, UCL_READ_WRITE)==UCL_SUCCESS) && success; success=success && (dev_numj_host.alloc(_max_host,*dev, @@ -152,7 +155,7 @@ void Neighbor::alloc(bool &success) { for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=_max_nbors; - } + } _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes(); } else { // Some OpenCL implementations return errors for NULL pointers as args @@ -176,7 +179,7 @@ void Neighbor::alloc(bool &success) { _allocated=true; } - + void Neighbor::clear() { _gpu_bytes=0.0; _cell_bytes=0.0; @@ -194,6 +197,7 @@ void Neighbor::clear() { host_packed.clear(); host_acc.clear(); + dev_acc.clear(); dev_nbor.clear(); nbor_host.clear(); dev_packed.clear(); @@ -219,13 +223,13 @@ double Neighbor::host_memory_usage() const { host_ilist.row_bytes()+host_jlist.row_bytes(); else return 0; - } else + } else return host_packed.row_bytes()*host_packed.rows()+host_acc.row_bytes()+ sizeof(Neighbor); } void Neighbor::get_host(const int inum, int *ilist, int *numj, - int **firstneigh, const int block_size) { + int **firstneigh, const int block_size) { _nbor_time_avail=true; time_nbor.start(); @@ -242,7 +246,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, int dev_count=0; int *h_ptr=host_packed.begin(); _nbor_pitch=inum; - + for (int ii=0; ii acc_view; acc_view.view_offset(inum,dev_nbor,inum*2); ucl_copy(acc_view,host_acc,true); + + UCL_H_Vec host_view; + host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE); + for (int ii=0; ii(ceil(static_cast(inum)*_threads_per_atom/ @@ -294,7 +307,7 @@ void Neighbor::get_host(const int inum, 
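Aside from whitespace, the substantive change in this patch is the new dev_acc vector allocated above: three-body styles need to map an atom index back to its row in the device neighbor list, so get_host now fills a host-side view and copies it up. The loop that fills host_view is truncated in the hunk; one plausible shape, assuming it simply inverts ilist (an illustration, not the patch's verbatim code):

#include <vector>

// Inverse of ilist: host_view[atom index] = neighbor-list row of that
// atom, copied into dev_acc for use by three-body kernels.
std::vector<int> build_acc_map(const int *ilist, int inum, int max_atoms) {
  std::vector<int> host_view(max_atoms, 0);
  for (int ii = 0; ii < inum; ii++)
    host_view[ilist[ii]] = ii;
  return host_view;
}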
int *ilist, int *numj, // inum=nlocal is forced to be true to allow direct indexing of neighbors of // neighbors void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, - int **firstneigh, const int block_size) { + int **firstneigh, const int block_size) { _nbor_time_avail=true; time_nbor.start(); @@ -311,7 +324,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, int dev_count=0; int *h_ptr=host_packed.begin(); _nbor_pitch=inum; - + if (nlist!=inum) host_acc.zero(inum); @@ -322,7 +335,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, host_acc[i+inum]=acc_count; acc_count+=nj; } - + for (int i=0; i(ceil(static_cast(inum)*_threads_per_atom/ @@ -366,7 +379,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj, template void Neighbor::resize_max_neighbors(const int maxn, bool &success) { - if (maxn>_max_nbors) { + if (maxn>_max_nbors) { int mn=static_cast(static_cast(maxn)*1.10); mn=(mn/_threads_per_atom+1)*_threads_per_atom; success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS); @@ -377,7 +390,7 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) { for (int i=0; i<_max_host; i++) { host_jlist[i]=ptr; ptr+=mn; - } + } _gpu_bytes+=nbor_host.row_bytes(); } else { nbor_host.device.view(dev_nbor); @@ -393,8 +406,8 @@ void Neighbor::resize_max_neighbors(const int maxn, bool &success) { template void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, - const int nall, Atom &atom, - double *sublo, double *subhi, tagint *tag, + const int nall, Atom &atom, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success, int &mn) { _nbor_time_avail=true; @@ -409,7 +422,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ncell_3d = ncellx * ncelly * ncellz; if (ncell_3d+1>_ncells) { cell_counts.clear(); - + if (_gpu_nbor==2) { if (_ncells>0) delete [] cell_iter; @@ -419,7 +432,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_counts.device.clear(); cell_counts.device.alloc(ncell_3d+1,dev_nbor); } - + _ncells=ncell_3d+1; _cell_bytes=cell_counts.device.row_bytes(); } @@ -445,17 +458,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); time_transpose.stop(); } - + // If binning on CPU, do this now if (_gpu_nbor==2) { double stime = MPI_Wtime(); int *cell_id=atom.host_cell_id.begin(); int *particle_id=atom.host_particle_id.begin(); - - // Build cell list on CPU + + // Build cell list on CPU cell_counts.host.zero(); double i_cell_size=1.0/_cell_size; @@ -475,12 +488,12 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, int iz = static_cast(pz*i_cell_size+1); iz = std::max(iz,_cells_in_cutoff); iz = std::min(iz,ncellz-offset_hi); - + int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; cell_counts[id+1]++; } - + for (int i=nt; i(pz*i_cell_size+1); iz = std::max(iz,0); iz = std::min(iz,ncellz-1); - + int id = ix+iy*ncellx+iz*ncellx*ncelly; cell_id[i] = id; cell_counts[id+1]++; } - + mn=0; for (int i=0; i<_ncells; i++) mn=std::max(mn,cell_counts[i]); @@ 
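When binning runs on the host (_gpu_nbor==2 in build_nbor_list above), each owned atom is mapped to an interior cell and per-cell counts are accumulated for the later prefix sum; ghost atoms get the same treatment with wider clamps. A condensed sketch of the owned-atom pass, assuming positions already shifted by sublo as in the surrounding code (offset_hi is passed through rather than derived, since its definition sits outside the hunk):

#include <algorithm>
#include <vector>

// Host-side binning from Neighbor::build_nbor_list (gpu_nbor==2): assign
// cell ids, clamped to interior cells, and count atoms per cell.
// cell_counts[id+1] is incremented so a prefix sum yields start offsets.
void bin_local_atoms(int nlocal, const double *px, const double *py,
                     const double *pz, double i_cell_size,
                     int ncellx, int ncelly, int ncellz,
                     int cells_in_cutoff, int offset_hi,
                     std::vector<int> &cell_id,
                     std::vector<int> &cell_counts) {
  for (int i = 0; i < nlocal; i++) {
    int ix = static_cast<int>(px[i]*i_cell_size + 1);
    ix = std::min(std::max(ix, cells_in_cutoff), ncellx - offset_hi);
    int iy = static_cast<int>(py[i]*i_cell_size + 1);
    iy = std::min(std::max(iy, cells_in_cutoff), ncelly - offset_hi);
    int iz = static_cast<int>(pz[i]*i_cell_size + 1);
    iz = std::min(std::max(iz, cells_in_cutoff), ncellz - offset_hi);
    const int id = ix + iy*ncellx + iz*ncellx*ncelly;
    cell_id[i] = id;
    cell_counts[id + 1]++;
  }
}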
-531,7 +544,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, ucl_copy(atom.dev_particle_id,atom.host_particle_id,true); time_hybrid2.stop(); _bin_time+=MPI_Wtime()-stime; - } + } time_kernel.start(); @@ -547,7 +560,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const numtyp sublo1=static_cast(sublo[1]); const numtyp sublo2=static_cast(sublo[2]); _shared->k_cell_id.set_size(GX,neigh_block); - _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, + _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id, &atom.dev_particle_id, &sublo0, &sublo1, &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz, &nt, &nall, &_cells_in_cutoff); @@ -556,10 +569,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, /* calculate cell count */ _shared->k_cell_counts.set_size(GX,neigh_block); - _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, + _shared->k_cell_counts.run(&atom.dev_cell_id, &cell_counts, &nall, &ncell_3d); - } - + } + /* build the neighbor list */ const int cell_block=_block_nbor_build; _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)* @@ -579,7 +592,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, host_offset.view_offset(inum,host_acc,nt-inum); ucl_copy(host_offset,dev_numj_host,nt-inum,true); } - + if (_gpu_nbor!=2) { host_acc.sync(); mn=host_acc[0]; @@ -587,7 +600,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, mn=std::max(mn,host_acc[i]); set_nbor_block_size(mn); - if (mn>_max_nbors) { + if (mn>_max_nbors) { resize_max_neighbors(mn,success); if (!success) return; @@ -599,13 +612,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, return; } } - + if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast (nt*_threads_per_atom)/cell_block)); _shared->k_special.set_size(GX2,cell_block); _shared->k_special.run(&dev_nbor, &nbor_host, &dev_numj_host, - &atom.dev_tag, &dev_nspecial, &dev_special, + &atom.dev_tag, &dev_nspecial, &dev_special, &inum, &nt, &_max_nbors, &_threads_per_atom); } time_kernel.stop(); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 7653291bbb..05168834c6 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -10,7 +10,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov, penwang@nvidia.com ***************************************************************************/ @@ -28,12 +28,12 @@ class Neighbor { public: Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {} ~Neighbor() { clear(); } - + /// Determine whether neighbor unpacking should be used - /** If false, twice as much memory is reserved to allow unpacking neighbors by + /** If false, twice as much memory is reserved to allow unpacking neighbors by * atom for coalesced access. 
**/ void packing(const bool use_packing) { _use_packing=use_packing; } - + /// Clear any old data and setup for new LAMMPS run /** \param inum Initial number of particles whose neighbors stored on device * \param host_inum Initial number of particles whose nbors copied to host @@ -45,20 +45,20 @@ class Neighbor { * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if gpu_nbor is true, and host needs a full nbor list * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel + * than the force kernel * \param threads_per_atom Number of threads used per atom for force - * calculation + * calculation * \param compile_flags Flags for JIT compiling **/ bool init(NeighborShared *shared, const int inum, const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &dev, const int gpu_nbor, const int gpu_host, const bool pre_cut, - const int block_cell_2d, const int block_cell_id, + const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const int warp_size, const bool time_device, + const int warp_size, const bool time_device, const std::string compile_flags); /// Set the size of the cutoff+skin - inline void cell_size(const double size, const double cutoff) { + inline void cell_size(const double size, const double cutoff) { _cell_size=size; _cutoff=cutoff; if (cutoff>size) @@ -66,7 +66,7 @@ class Neighbor { else _cells_in_cutoff=1; } - + /// Get the size of the cutoff+skin inline double cell_size() const { return _cell_size; } @@ -88,7 +88,7 @@ class Neighbor { * \param host_inum Number of particles whose nbors will be copied to host * \param max_nbor Current max number of neighbors for a particle * \param success False if insufficient memory **/ - inline void resize(const int inum, const int host_inum, const int max_nbor, + inline void resize(const int inum, const int host_inum, const int max_nbor, bool &success) { if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) { _max_atoms=static_cast(static_cast(inum)*1.10); @@ -124,40 +124,40 @@ class Neighbor { /// Free all memory on host and device void clear(); - + /// Bytes per atom used on device int bytes_per_atom(const int max_nbors) const; - + /// Total host memory used by class double host_memory_usage() const; - + /// Returns the type of neighboring: /** - 0 if neighboring will be performed on host * - 1 if neighboring will be performed on device * - 2 if binning on host and neighboring on device **/ inline int gpu_nbor() const { return _gpu_nbor; } - + /// Make a copy of unpacked nbor lists in the packed storage area (for gb) - inline void copy_unpacked(const int inum, const int maxj) + inline void copy_unpacked(const int inum, const int maxj) { ucl_copy(dev_packed,dev_nbor,inum*(maxj+2),true); } - /// Copy neighbor list from host (first time or from a rebuild) - void get_host(const int inum, int *ilist, int *numj, + /// Copy neighbor list from host (first time or from a rebuild) + void get_host(const int inum, int *ilist, int *numj, int **firstneigh, const int block_size); - - /// Copy neighbor list from host for 3-body (first time or from a rebuild) - void get_host3(const int inum, const int nlist, int *ilist, int *numj, + + /// Copy neighbor list from host for 3-body (first time or from a rebuild) + void get_host3(const int inum, const int nlist, int *ilist, int *numj, int **firstneigh, const int block_size); - + /// Return the stride in elements for each nbor row inline int nbor_pitch() const { return 
_nbor_pitch; } - + /// Return the maximum number of atoms that can currently be stored inline int max_atoms() const { return _max_atoms; } /// Return the maximum number of nbors for a particle based on current alloc inline int max_nbors() const { return _max_nbors; } - + /// Return the time spent binning on the CPU for hybrid neighbor builds inline double bin_time() const { return _bin_time; } @@ -171,9 +171,9 @@ class Neighbor { /// Build nbor list on the device template - void build_nbor_list(double **x, const int inum, const int host_inum, + void build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, + double *subhi, tagint *tag, int **nspecial, tagint **special, bool &success, int &max_nbors); /// Return the number of bytes used on device @@ -184,7 +184,7 @@ class Neighbor { return res; } - + // ------------------------------- Data ------------------------------- /// Device neighbor matrix @@ -199,6 +199,8 @@ class Neighbor { UCL_H_Vec host_packed; /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2) UCL_H_Vec host_acc; + /// Device storage for accessing atom indices from the neighbor list (3-body) + UCL_D_Vec dev_acc; // ----------------- Data for GPU Neighbor Calculation --------------- @@ -219,7 +221,7 @@ class Neighbor { /// Device timers UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; - + private: NeighborShared *_shared; UCL_Device *dev; @@ -231,14 +233,14 @@ class Neighbor { double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); - + int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; int _ncells, _threads_per_atom, _total_atoms; int _cells_in_cutoff; template inline void resize_max_neighbors(const int maxn, bool &success); - + int _warp_size; inline void set_nbor_block_size(const int mn) { int desired=mn/(2*_warp_size); diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu index 384b88d9de..d005eb9f97 100644 --- a/lib/gpu/lal_neighbor_cpu.cu +++ b/lib/gpu/lal_neighbor_cpu.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -17,7 +17,7 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_unpack(__global int *dev_nbor, +__kernel void kernel_unpack(__global int *dev_nbor, const __global int *dev_ij, const int inum, const int t_per_atom) { int tid=THREAD_ID_X; @@ -33,7 +33,7 @@ __kernel void kernel_unpack(__global int *dev_nbor, list+=offset; nbor+=fast_mul(ii,t_per_atom-1)+offset; int stride=fast_mul(t_per_atom,inum); - + for ( ; list pos_tex; texture pos_tex; #endif -__kernel void calc_cell_id(const numtyp4 *restrict pos, - unsigned *restrict cell_id, +__kernel void calc_cell_id(const numtyp4 *restrict pos, + unsigned *restrict cell_id, int *restrict particle_id, - numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, - numtyp i_cell_size, int ncellx, int ncelly, - int ncellz, int inum, int nall, + numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, + numtyp i_cell_size, int ncellx, int ncelly, + int ncellz, int inum, int nall, int cells_in_cutoff) { int i = threadIdx.x + blockIdx.x*blockDim.x; @@ -48,11 +48,11 @@ __kernel void calc_cell_id(const numtyp4 *restrict pos, p.x -= boxlo0; p.y -= boxlo1; p.z -= boxlo2; - + 
int ix = int(p.x*i_cell_size+cells_in_cutoff); int iy = int(p.y*i_cell_size+cells_in_cutoff); int iz = int(p.z*i_cell_size+cells_in_cutoff); - + int offset_lo, offset_hi; if (i 0 && idx < nall) { int id_l = cell_id[idx-1]; if (id != id_l) { - for (int i = id_l+1; i <= id; i++) + for (int i = id_l+1; i <= id; i++) cell_counts[i] = idx; } } @@ -114,36 +114,36 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, #endif #endif -__kernel void transpose(__global tagint *restrict out, - const __global tagint *restrict in, +__kernel void transpose(__global tagint *restrict out, + const __global tagint *restrict in, int columns_in, int rows_in) { - __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; - - unsigned ti=THREAD_ID_X; - unsigned tj=THREAD_ID_Y; - unsigned bi=BLOCK_ID_X; - unsigned bj=BLOCK_ID_Y; - - unsigned i=bi*BLOCK_CELL_2D+ti; - unsigned j=bj*BLOCK_CELL_2D+tj; - if ((i 1e-5) { cnt++; @@ -240,11 +240,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, if ((cnt & (t_per_atom-1))==0) neigh_list=neigh_list+stride; } - } + } } } - __syncthreads(); - } // for (k) + __syncthreads(); + } // for (k) } } } @@ -253,11 +253,11 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } // for (i) } -__kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, - const __global int *host_numj, +__kernel void kernel_special(__global int *dev_nbor, + __global int *host_nbor_list, + const __global int *host_numj, const __global tagint *restrict tag, - const __global int *restrict nspecial, + const __global int *restrict nspecial, const __global tagint *restrict special, int inum, int nt, int max_nbors, int t_per_atom) { int tid=THREAD_ID_X; @@ -268,7 +268,7 @@ __kernel void kernel_special(__global int *dev_nbor, if (iigpu_bytes(); - + _order=order; _order_m_1=order-1; _order2=_order_m_1*_order; @@ -130,7 +130,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, view.view(rho_coeff[0]+n2lo,numel,*ucl_device); ucl_copy(d_rho_coeff,view,true); _max_bytes+=d_rho_coeff.row_bytes(); - + // Allocate storage for grid _npts_x=nxhi_out-nxlo_out+1; _npts_y=nyhi_out-nylo_out+1; @@ -165,10 +165,10 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen, flag=-3; return 0; } - + error_flag.device.zero(); _max_bytes+=1; - + _cpu_idle_time=0.0; return brick.host.begin(); @@ -180,13 +180,13 @@ void PPPMT::clear(const double cpu_time) { return; _allocated=false; _precompute_done=false; - + brick.clear(); vd_brick.clear(); d_brick_counts.clear(); error_flag.clear(); d_brick_atoms.clear(); - + acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, *ans,_max_bytes+_max_an_bytes,cpu_time, @@ -216,7 +216,7 @@ void PPPMT::clear(const double cpu_time) { template void PPPMT::_precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { acc_timers(); @@ -224,7 +224,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, zero_timers(); return; } - + ans->inum(nlocal); if (ago==0) { @@ -250,7 +250,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + // Boxlo adjusted to be upper left brick and shift for even spline order double shift=0.0; if (_order % 
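The transpose kernel above stages a BLOCK_CELL_2D x BLOCK_CELL_2D tile of the special-bond table in __local memory with one column of padding, so both the global read and the global write coalesce while the pad keeps column-wise tile accesses on distinct shared-memory banks. The same technique in plain CUDA (TILE and the element type are illustrative; tagint may be 32- or 64-bit):

#define TILE 16  // stands in for BLOCK_CELL_2D

// Tiled transpose: read a tile row-wise, sync, write it column-wise.
// The +1 pad column avoids shared-memory bank conflicts.
__global__ void transpose_tiled(long long *out, const long long *in,
                                int cols_in, int rows_in) {
  __shared__ long long tile[TILE][TILE + 1];
  unsigned i = blockIdx.x*TILE + threadIdx.x;
  unsigned j = blockIdx.y*TILE + threadIdx.y;
  if (i < cols_in && j < rows_in)
    tile[threadIdx.y][threadIdx.x] = in[j*cols_in + i];
  __syncthreads();
  i = blockIdx.y*TILE + threadIdx.x;    // transposed block origin
  j = blockIdx.x*TILE + threadIdx.y;
  if (i < rows_in && j < cols_in)
    out[j*rows_in + i] = tile[threadIdx.x][threadIdx.y];
}

// Launched, as in the k_transpose call above, with a 2D grid of
// TILE x TILE blocks covering the columns_in x rows_in input.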
2) @@ -258,7 +258,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; - + _delxinv=delxinv; _delyinv=delyinv; _delzinv=delzinv; @@ -268,7 +268,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, device->zero(d_brick_counts,d_brick_counts.numel()); k_particle_map.set_size(GX,BX); k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum, - &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, + &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &error_flag); @@ -299,7 +299,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall, template int PPPMT::spread(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, + double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { if (_precompute_done==false) { @@ -309,10 +309,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, } device->stop_host_timer(); - + if (!success || nlocal==0) return 0; - + double t=MPI_Wtime(); time_out.sync_stop(); _cpu_idle_time+=MPI_Wtime()-t; @@ -325,10 +325,10 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, error_flag.device.zero(); d_brick_atoms.resize(_atom_stride*_max_brick_atoms); _max_bytes+=d_brick_atoms.row_bytes(); - return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, + return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } - + return error_flag[0]; } @@ -340,18 +340,18 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { time_in.start(); vd_brick.update_device(true); time_in.stop(); - + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); int ainum=this->ans->inum(); - + k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, - &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, + &_delyinv, &_delzinv, &_order, &_order2, &qqrd2e_scale, &ans->force); time_interp.stop(); @@ -381,7 +381,7 @@ void PPPMT::compile_kernels(UCL_Device &dev) { #endif pppm_program=new UCL_Program(dev); - + #ifdef USE_OPENCL pppm_program->load_string(pppm,flags.c_str()); #else diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 99fe655dfd..24636b9a93 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ @@ -48,17 +48,17 @@ texture q_tex; // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) -__kernel void particle_map(const __global numtyp4 *restrict x_, +__kernel void particle_map(const __global numtyp4 *restrict x_, const __global numtyp *restrict q_, - const grdtyp delvolinv, const int nlocal, - __global int *restrict counts, - __global grdtyp4 *restrict ans, + const grdtyp delvolinv, const int nlocal, + 
__global int *restrict counts, + __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, + const int max_atoms, __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -76,7 +76,7 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, grdtyp4 delta; fetch(delta.w,ii,q_tex); delta.w*=delvolinv; - + if (delta.w!=(grdtyp)0.0) { delta.x=(p.x-b_lo_x)*delxinv; nx=delta.x; @@ -85,14 +85,14 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, delta.z=(p.z-b_lo_z)*delzinv; nz=delta.z; - if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || + if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+(grdtyp)0.5-delta.x; delta.y=ny+(grdtyp)0.5-delta.y; delta.z=nz+(grdtyp)0.5-delta.z; - + int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old>=max_atoms) { @@ -107,9 +107,9 @@ __kernel void particle_map(const __global numtyp4 *restrict x_, /* --------------------------- */ -__kernel void make_rho(const __global int *restrict counts, +__kernel void make_rho(const __global int *restrict counts, const __global grdtyp4 *restrict atoms, - __global grdtyp *restrict brick, + __global grdtyp *restrict brick, const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, @@ -118,15 +118,15 @@ __kernel void make_rho(const __global int *restrict counts, __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; - + int tid=THREAD_ID_X; if (tid -1; k-=order) { @@ -184,14 +184,14 @@ __kernel void make_rho(const __global int *restrict counts, z_pos+=z_stride; } } - + __syncthreads(); if (fid *device; @@ -142,21 +142,21 @@ class PPPM { UCL_Vector brick; UCL_Vector vd_brick; - + // Count of number of atoms assigned to each grid point UCL_D_Vec d_brick_counts; // Atoms assigned to each grid point UCL_D_Vec d_brick_atoms; - + // Error checking for out of bounds atoms UCL_Vector error_flag; - + // Number of grid points in brick (including ghost) int _npts_x, _npts_y, _npts_z, _npts_yx; - + // Number of local grid points in brick int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride; - + // -------------------------- SPLINE DATA ------------------------- UCL_D_Vec d_rho_coeff; int _order, _nlower, _nupper, _order_m_1, _order2; @@ -180,12 +180,12 @@ class PPPM { int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; double _cpu_idle_time; - - grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; + + grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; double _slab_volfactor; int _nx_pppm, _ny_pppm, _nz_pppm; - + void compile_kernels(UCL_Device &dev); void _precompute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 6e5a82af5b..7e07d6c87b 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) 
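particle_map above bins each charged atom into the local PPPM brick: it converts the position to grid coordinates, raises the error flag for out-of-bounds atoms, and appends the atom to a per-cell list behind an atom_add counter (overflow makes spread() resize d_brick_atoms and retry, as shown in lal_pppm.cpp). A scalar model of that logic; the atoms4 indexing at the end is a guess, since the store itself is not visible in the hunk:

// Scalar model of the particle_map kernel. Returns 0 on success, 1 for
// an out-of-bounds atom (kernel sets *error=1), 2 for cell overflow
// (kernel flags it; the host then resizes and retries the spread).
int map_particle(double x, double y, double z, double q_delvolinv,
                 double b_lo_x, double b_lo_y, double b_lo_z,
                 double delxinv, double delyinv, double delzinv,
                 int nlocal_x, int nlocal_y, int nlocal_z,
                 int atom_stride, int max_atoms,
                 int *counts, double *atoms4) {
  if (q_delvolinv == 0.0) return 0;
  const double dx = (x - b_lo_x)*delxinv;
  const double dy = (y - b_lo_y)*delyinv;
  const double dz = (z - b_lo_z)*delzinv;
  const int nx = (int)dx, ny = (int)dy, nz = (int)dz;
  if (dx < 0.0 || dy < 0.0 || dz < 0.0 ||
      nx >= nlocal_x || ny >= nlocal_y || nz >= nlocal_z) return 1;
  const int cell = nz*nlocal_y*nlocal_x + ny*nlocal_x + nx;
  const int old = counts[cell]++;        // atom_add() on the device
  if (old >= max_atoms) return 2;
  double *slot = atoms4 + 4*(old*atom_stride + cell);  // layout assumed
  slot[0] = nx + 0.5 - dx;               // offsets to the cell center
  slot[1] = ny + 0.5 - dy;
  slot[2] = nz + 0.5 - dz;
  slot[3] = q_delvolinv;                 // charge scaled by 1/cell volume
  return 0;
}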
__________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static PPPM PPPMD; // --------------------------------------------------------------------------- template grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, - FILE *screen, const int order, const int nxlo_out, + FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, @@ -82,7 +82,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, split,success); pppm.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -91,7 +91,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, @@ -102,7 +102,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) - PPPMF.device->set_single_precompute(&PPPMF); + PPPMF.device->set_single_precompute(&PPPMF); return b; } @@ -133,20 +133,20 @@ void pppm_gpu_forces_f(double **f) { } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, - const int order, const int nxlo_out, + const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, - const int nz_pppm, const bool split, + const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, - split,success); + split,success); if (split==false && respa==false) - PPPMD.device->set_double_precompute(&PPPMD); + PPPMD.device->set_double_precompute(&PPPMD); return b; } diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 24f5b937f7..d5b1b9b6c0 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : brownw@ornl.gov ***************************************************************************/ @@ -49,17 +49,17 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) { out << v.x << " " << v.y << " " << v.z; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) { out << v.x << " " << v.y; return out; } - + inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { out << v.x << " " << v.y << " " << v.z; return out; @@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define OCL_DEFAULT_VENDOR "generic" #endif +#ifdef INTEL_OCL +#define OCL_DEFAULT_VENDOR "intel" 
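// The #ifdef chain above (continuing below) selects a compile-time default
// OpenCL vendor string: building with -DINTEL_OCL or -DPHI_OCL now picks
// "intel" or "phi" alongside the existing vendor symbols, and the trailing
// #ifndef fallback leaves OCL_DEFAULT_VENDOR as "none" when no flag is set.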
+#endif + +#ifdef PHI_OCL +#define OCL_DEFAULT_VENDOR "phi" +#endif + #ifndef OCL_DEFAULT_VENDOR #define OCL_DEFAULT_VENDOR "none" #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 9dbb3c5944..69a8e61bd4 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -9,16 +9,16 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions -// +// // Note: It is assumed that constants with the same names are defined with // the same values in all files. -// +// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS @@ -35,22 +35,22 @@ // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D +// PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D +// BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose -// BLOCK_CELL_ID +// BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD +// BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles @@ -78,10 +78,10 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define __global +#define __global #define restrict __restrict__ #define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ +#define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index cbf50fab7d..9513f5a633 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -37,18 +37,18 @@ RESquaredT::RESquared() : BaseEllipsoid(), } template -RESquaredT::~RESquared() { +RESquaredT::~RESquared() { clear(); } - + template int RESquaredT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom(max_nbors); } template -int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, +int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, double **host_epsilon, int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, @@ -81,23 +81,23 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write, - host_sigma,host_epsilon); + host_sigma,host_epsilon); this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
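// The #define block in lal_preprocessor.h above is what lets one kernel
// source build for both backends: under CUDA the OpenCL spellings are
// remapped (__kernel to extern "C" __global__, __local to __shared__,
// atom_add to atomicAdd, __global to nothing). A minimal kernel written
// against the portable spellings -- illustrative only, and assuming this
// header's macros (including GLOBAL_ID_X for the global thread index):

__kernel void k_scale(__global float *restrict v, const float s,
                      const int n) {
  int i = GLOBAL_ID_X;        // portable global thread id
  if (i < n)
    v[i] *= s;                // identical source for CUDA and OpenCL builds
}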
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write, - host_cutsq,h_form); + host_cutsq,h_form); lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, - host_cutsq,h_form); + host_cutsq,h_form); lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, - host_offset); + host_offset); dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY); dev_error.zero(); - + // Allocate, cast and asynchronous memcpy of constant data // Copy data for bonded interactions special_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -127,7 +127,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, } view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); ucl_copy(well,view4,false); - + _allocated=true; this->_max_bytes=sigma_epsilon.row_bytes()+this->cut_form.row_bytes()+ lj1.row_bytes()+lj3.row_bytes()+special_lj.row_bytes()+ @@ -144,7 +144,7 @@ void RESquaredT::clear() { UCL_H_Vec err_flag(1,*(this->ucl_device)); ucl_copy(err_flag,dev_error,false); if (err_flag[0] == 2) - std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; + std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n"; err_flag.clear(); _allocated=false; @@ -158,7 +158,7 @@ void RESquaredT::clear() { shape.clear(); well.clear(); special_lj.clear(); - + this->clear_base(); } @@ -184,7 +184,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -197,34 +197,34 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { (BX/this->_threads_per_atom))); NGX=static_cast(ceil(static_cast(this->_last_ellipse)/BX)); this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->sigma_epsilon, &this->_lj_types, + &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, &this->ans->engv, - &this->dev_error, &eflag, &vflag, + &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid.stop(); // ------------ ELLIPSE_SPHERE --------------- this->time_nbor2.start(); this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE, - ELLIPSE_SPHERE,_shared_types,_lj_types); + ELLIPSE_SPHERE,_shared_types,_lj_types); this->time_nbor2.stop(); this->time_ellipsoid2.start(); this->k_ellipsoid_sphere.set_size(GX,BX); - this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, + this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force,&ainum, - &this->ans->engv, &this->dev_error, + &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &this->_threads_per_atom); this->time_ellipsoid2.stop(); @@ -245,18 +245,18 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum()- this->_last_ellipse)/BX)); 
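// Launch sizing in loop() above follows the library-wide convention: BX
// threads per block, _threads_per_atom cooperating threads per atom, hence
// each block covers BX/t_per_atom atoms. The same computation as a
// standalone helper (hypothetical name):

inline int grid_size(int inum, int block_size, int t_per_atom) {
  int atoms_per_block = block_size / t_per_atom;         // both powers of two
  return (inum + atoms_per_block - 1) / atoms_per_block; // ceil division
}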
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(), - SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); + SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types); this->time_nbor3.stop(); this->time_ellipsoid3.start(); this->k_sphere_ellipsoid.set_size(GX,BX); this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, + &this->shape, &this->well, &this->special_lj, &this->sigma_epsilon, &this->_lj_types, - &this->nbor->dev_nbor, &stride, + &this->nbor->dev_nbor, &stride, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, - &this->_last_ellipse, &ainum, + &this->_last_ellipse, &ainum, &this->_threads_per_atom); this->time_ellipsoid3.stop(); } else { @@ -266,13 +266,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { this->ans->force.zero(); this->ans->engv.zero(); this->time_nbor1.zero(); - this->time_ellipsoid.zero(); + this->time_ellipsoid.zero(); this->time_nbor2.zero(); this->time_ellipsoid2.zero(); this->time_nbor3.zero(); this->time_ellipsoid3.zero(); } - + // ------------ LJ --------------- this->time_lj.start(); if (this->_last_ellipseans->inum()) { @@ -287,7 +287,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { } else { this->k_lj.set_size(GX,BX); this->k_lj.run(&this->atom->x, &this->lj1, &this->lj3, - &this->_lj_types, &this->special_lj, &stride, + &this->_lj_types, &this->special_lj, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom); @@ -300,15 +300,15 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) { NGX=static_cast(ceil(static_cast(this->ans->inum())/BX)); this->time_nbor1.start(); this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE, - ELLIPSE_ELLIPSE,_shared_types,_lj_types); + ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); - this->time_ellipsoid.start(); + this->time_ellipsoid.start(); this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, - &this->shape, &this->well, &this->special_lj, - &this->sigma_epsilon, &this->_lj_types, + this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + &this->shape, &this->well, &this->special_lj, + &this->sigma_epsilon, &this->_lj_types, &this->nbor->dev_nbor, &stride, &this->ans->force, - &ainum, &this->ans->engv, &this->dev_error, + &ainum, &this->ans->engv, &this->dev_error, &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index 3a65ce14ce..e238734074 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -34,31 +34,31 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, - const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, - const __global numtyp2 *restrict sig_eps, - const int ntypes, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, + const int astride, __global acctyp *restrict engv, - __global int *restrict err_flag, + __global int *restrict err_flag, const int 
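// Dispatch summary for RESquaredT::loop() above: atoms are evidently ordered
// so ellipsoids occupy indices [0,_last_ellipse); neighbors are packed once
// per pairing class and a dedicated kernel handles each case -- k_ellipsoid
// for ELLIPSE_ELLIPSE, k_ellipsoid_sphere for ELLIPSE_SPHERE,
// k_sphere_ellipsoid for SPHERE_ELLIPSE, and the plain k_lj/k_lj_fast path
// for the remaining sphere-sphere pairs -- so the expensive quaternion math
// only runs where at least one particle is aspherical.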
eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); acctyp energy=(acctyp)0; acctyp4 f; @@ -79,7 +79,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -91,14 +91,14 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa numtyp4 ishape; - + ishape=shape[itype]; numtyp4 ishape2; ishape2.x=ishape.x*ishape.x; ishape2.y=ishape.y*ishape.y; ishape2.z=ishape.z*ishape.z; numtyp ilshape = ishape.x*ishape.y*ishape.z; - + { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,i,a1); @@ -148,7 +148,7 @@ numtyp a2[9]; // Rotation matrix (lab->body) numtyp gamma2[9]; // A'*S^2*A numtyp4 jshape; - + jshape=shape[jtype]; numtyp4 jshape2; jshape2.x=jshape.x*jshape.x; @@ -189,7 +189,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; dH=gpu_det3(H12); - + numtyp sigma1p2, sigma2p2, lambda, nu; sigma1p2 = sigma1*sigma1; sigma2p2 = sigma2*sigma2; @@ -299,7 +299,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+(numtyp)3.0*sec); dspu = ucl_recip(h12)-hsec+stemp; pbsu = (numtyp)3.0*sigma*hsec; - + numtyp dspr, pbsr; stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ @@ -310,7 +310,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); dspr = (numtyp)7.0/h12-hsec+stemp; pbsr = b_alpha*sigma*hsec; - + numtyp dH12[9]; numtyp dUa, dUr, deta, dchi, ddH, dh12; numtyp dsigma1, dsigma2; diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index c7441ed83e..8dc137d829 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -25,14 +25,14 @@ template class RESquared : public BaseEllipsoid { public: RESquared(); - ~RESquared(); + ~RESquared(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device + * \param gpu_split fraction of particles handled by device * \return false if there is not sufficient memory or device init problem - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -41,7 +41,7 @@ class RESquared : public BaseEllipsoid { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_shape, double **host_well, double **host_cutsq, double **host_sigma, double **host_epsilon, - int **h_form, double **host_lj1, double **host_lj2, + int **h_form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, const double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -50,7 +50,7 @@ /// Clear all host and
device data /** \note This is called at the beginning of the init() routine **/ void clear(); - + /// Returns memory usage on device per atom int bytes_per_atom(const int max_nbors) const; @@ -59,8 +59,8 @@ class RESquared : public BaseEllipsoid { /// Device Error Flag - Set if a bad matrix inversion occurs UCL_D_Vec dev_error; - - // --------------------------- TYPE DATA -------------------------- + + // --------------------------- TYPE DATA -------------------------- /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form UCL_D_Vec lj1; @@ -70,12 +70,12 @@ class RESquared : public BaseEllipsoid { UCL_D_Vec sigma_epsilon; /// special lj 0-4 UCL_D_Vec special_lj; - + /// If atom type constants fit in shared memory, use fast kernels bool _shared_types; int _lj_types; - - // --------------------------- ATOM DATA -------------------------- + + // --------------------------- ATOM DATA -------------------------- /// Aspherical Const Data for Atoms UCL_D_Vec shape, well; diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index e1d8fffb8f..b719dfe05f 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -28,8 +28,8 @@ static RESquared REMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, - int **form, double **host_lj1, double **host_lj2, + double **sigma, double **epsilon, + int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, + init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -64,7 +64,7 @@ int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); - + for (int i=0; igpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ void re_gpu_clear() { int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); @@ -114,8 +114,8 @@ int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { - return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, - tag, nspecial, special, eflag, vflag, eatom, vatom, + return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 
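// The re_gpu_init sequence above is the library's staged startup pattern:
// world rank 0 runs init() first (compiling the kernels once), all ranks
// meet at world_barrier(), then the remaining ranks sharing each GPU take
// turns between gpu_barrier() calls -- presumably so device setup and
// kernel compilation are serialized instead of contended.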
4742e5bd8e..d69dae2461 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -129,32 +129,32 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, - const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, - const __global numtyp *restrict splj, + const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - const int astride, - __global acctyp *restrict engv, - __global int *restrict err_flag, - const int eflag, const int vflag, - const int inum, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -177,7 +177,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); - + numtyp4 ix; fetch4(ix,i,pos_tex); int itype=ix.w; @@ -223,7 +223,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma*(numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -260,7 +260,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, Ua = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/(numtyp)8.0; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -290,7 +290,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -298,7 +298,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -334,7 +334,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } - + // torque on i numtyp fwae[3]; gpu_row_times3(fourw,aTe,fwae); @@ -384,33 +384,33 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict q, + const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, - 
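// On the solvation prefactors above: ucl_atan((numtyp)1.0) is pi/4, so
// 16*atan(1) is 4*pi and the constants reduce to
//   solv_f_a = 3/(-144*pi)   and   solv_f_r = 3/(8100*pi).
// Deriving pi from atan keeps the value at native numtyp precision in both
// single- and double-precision builds instead of hard-coding a literal.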
const int ntypes, + const int ntypes, const __global int *dev_nbor, - const int stride, + const int stride, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=splj[0]; - sp_lj[1]=splj[1]; - sp_lj[2]=splj[2]; + sp_lj[0]=splj[0]; + sp_lj[1]=splj[1]; + sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - + __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + cr60=ucl_cbrt((numtyp)60.0); solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); @@ -429,7 +429,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, __local int n_stride; nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj, n_stride,nbor_end,nbor); - + numtyp4 jx; fetch4(jx,j,pos_tex); int jtype=jx.w; @@ -445,7 +445,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp a[9]; // Rotation matrix (lab->body) numtyp aTe[9]; // A'*E numtyp4 ishape; - + ishape=shape[itype]; gpu_quat_to_mat_trans(q,i,a); gpu_transpose_times_diag3(a,well[itype],aTe); @@ -467,7 +467,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; - numtyp aTs[9]; + numtyp aTs[9]; numtyp4 scorrect; numtyp half_sigma=sigma * (numtyp)0.5; scorrect.x = ishape.x+half_sigma; @@ -477,7 +477,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, scorrect.y = scorrect.y * scorrect.y * (numtyp)0.5; scorrect.z = scorrect.z * scorrect.z * (numtyp)0.5; gpu_transpose_times_diag3(a,scorrect,aTs); - + // energy numtyp gamma[9], s[3]; @@ -505,7 +505,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp ilshape=ishape.x*ishape.y*ishape.z; Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*ilshape/Ua; Ua = epsilon*Ua*sigmap3*solv_f_a; - + stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)*(ishape.z+stemp)*h12p3/ (numtyp)60.0; @@ -535,7 +535,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp hsec = ucl_recip(h12+(numtyp)3.0*sec); numtyp dspu = ucl_recip(h12)-hsec+stemp; numtyp pbsu = (numtyp)3.0*sigma*hsec; - + stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ @@ -543,7 +543,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, hsec = ucl_recip(h12+b_alpha*sec); numtyp dspr = (numtyp)7.0/h12-hsec+stemp; numtyp pbsr = b_alpha*sigma*hsec; - + #pragma unroll for (int i=0; i<3; i++) { numtyp u[3]; @@ -584,15 +584,15 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1, - const __global numtyp4 *restrict lj3, - const int lj_types, - const __global numtyp *restrict gum, - const int stride, - const __global int *dev_ij, +__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict gum, + const int stride, + const 
__global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, const int start, const int inum, const int t_per_atom) { @@ -601,10 +601,10 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; - sp_lj[0]=gum[0]; - sp_lj[1]=gum[1]; - sp_lj[2]=gum[2]; - sp_lj[3]=gum[3]; + sp_lj[0]=gum[0]; + sp_lj[1]=gum[1]; + sp_lj[2]=gum[2]; + sp_lj[3]=gum[3]; acctyp energy=(acctyp)0; acctyp4 f; @@ -614,20 +614,20 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); - energy+=factor_lj*(e-lj3[ii].z); + energy+=factor_lj*(e-lj3[ii].z); } if (vflag>0) { virial[0] += delx*delx*force; @@ -671,33 +671,33 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict lj1_in, - const __global numtyp4 *restrict lj3_in, - const __global numtyp *restrict gum, +__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict gum, const int stride, const __global int *dev_ij, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, - const int start, const int inum, + const int start, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; - __local numtyp sp_lj[4]; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; if (tid<4) - sp_lj[tid]=gum[tid]; + sp_lj[tid]=gum[tid]; if (tid0) lj3[tid]=lj3_in[tid]; } - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; @@ -706,9 +706,9 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + __syncthreads(); - + if (ii0) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + energy+=factor_lj*(e-lj3[mtype].z); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index c206a997a9..727b112ea5 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,10 +33,10 @@ SoftT::Soft() : BaseAtomic(), _allocated(false) { } template -SoftT::~Soft() { +SoftT::~Soft() { clear(); } - + template int SoftT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -74,7 +74,7 @@ int SoftT::init(const int ntypes, double **host_cutsq, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor, - host_cut,host_cutsq); + host_cut,host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -89,16 +89,16 @@ int SoftT::init(const int ntypes, double **host_cutsq, template void SoftT::reinit(const int ntypes, double 
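// The *_fast kernels above stage per-type coefficients in __local (shared)
// memory: the first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads each copy one
// entry, every thread synchronizes, and the pair loop then reads
// coefficients without touching global memory. The pattern in isolation --
// a sketch assuming the library's portable macros and types:

__kernel void k_pair_fast_sketch(const __global numtyp4 *restrict coeff_in,
                                 const int ntypes_sq) {
  __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  int tid = THREAD_ID_X;
  if (tid < ntypes_sq)           // valid: MAX_SHARED_TYPES^2 <= BLOCK_PAIR
    coeff[tid] = coeff_in[tid];  // one cooperative copy per thread
  __syncthreads();               // all copies visible before any reads
  // ... pair loop indexes coeff[itype*MAX_SHARED_TYPES+jtype] from here ...
}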
**host_cutsq, double **host_prefactor, double **host_cut) { - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), UCL_WRITE_ONLY); - + for (int i=0; i<_lj_types*_lj_types; i++) host_write[i]=0.0; - + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor, - host_cut,host_cutsq); + host_cut,host_cutsq); } template @@ -134,7 +134,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index b7c32b6879..831b986725 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -26,7 +26,7 @@ texture pos_tex; #define MY_PI (acctyp)3.14159265358979323846 -__kernel void k_soft(const __global numtyp4 *restrict x_, +__kernel void k_soft(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const int lj_types, const __global numtyp *restrict sp_lj_in, @@ -51,20 +51,20 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -111,7 +111,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_soft_fast(const __global numtyp4 *restrict x_, +__kernel void k_soft_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, @@ -122,7 +122,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -130,7 +130,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, if (tid (numtyp)0.0) force = factor_lj * coeff[mtype].x * sin(arg) * MY_PI/coeff[mtype].y*ucl_recip(r); else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); - energy+=factor_lj*e; + energy+=factor_lj*e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index 7fa529c4f5..e72673248c 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Soft : public BaseAtomic { public: Soft(); - ~Soft(); + ~Soft(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param 
gpu_split fraction of particles handled by device - * + * Returns: * - 0 if successful * - -1 if fix gpu not found @@ -40,14 +40,14 @@ class Soft : public BaseAtomic { int init(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Send updated coeffs from host to device (to be compatible with fix adapt) void reinit(const int ntypes, double **host_cutsq, double **host_prefactor, double **host_cut); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -68,7 +68,7 @@ class Soft : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; private: diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 9591923965..d3b3fa2598 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -77,7 +77,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -96,16 +96,16 @@ void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); - + if (world_me==0) SLMF.reinit(ntypes, cutsq, host_prefactor, host_cut); - + SLMF.device->world_barrier(); - + for (int i=0; igpu_barrier(); } } @@ -124,8 +124,8 @@ int ** soft_gpu_compute_n(const int ago, const int inum_full, return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 1f68616b0e..3492d7030e 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -33,10 +33,10 @@ SWT::SW() : BaseThree(), _allocated(false) { } template -SWT::~SW() { +SWT::~SW() { clear(); } - + template int SWT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } @@ -45,7 +45,7 @@ int SWT::bytes_per_atom(const int max_nbors) const { template int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* epsilon, const double* sigma, const double* lambda, const double* gamma, const double* costheta, const double* biga, @@ -76,41 +76,41 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ UCL_WRITE_ONLY); for (int i=0;
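// The k_soft kernels above evaluate what appears to be the standard soft
// potential with A = coeff.x (prefactor) and rc = coeff.y (cutoff):
//   E(r) = A*(1 + cos(pi*r/rc)),   F(r) = A*(pi/rc)*sin(pi*r/rc),
// with the kernel keeping F(r)/r so it can scale delx/dely/delz directly.
// A plain C++ restatement for checking (standalone, illustrative):

#include <cmath>

inline double soft_energy(double r, double A, double rc) {
  return A * (1.0 + std::cos(M_PI * r / rc));
}

inline double soft_force_over_r(double r, double A, double rc) {
  // same grouping as the kernel: A * sin(arg) * pi/rc * (1/r)
  return A * std::sin(M_PI * r / rc) * (M_PI / rc) / r;
}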
iucl_device),UCL_READ_ONLY); - + for (int i=0; i(epsilon[i]); dview[i].y=static_cast(sigma[i]); dview[i].z=static_cast(lambda[i]); dview[i].w=static_cast(gamma[i]); } - + ucl_copy(sw1,dview,false); sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); sw1_tex.bind_float(sw1,4); sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(biga[i]); dview[i].y=static_cast(bigb[i]); dview[i].z=static_cast(powerp[i]); dview[i].w=static_cast(powerq[i]); } - + ucl_copy(sw2,dview,false); sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); sw2_tex.bind_float(sw2,4); sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - + for (int i=0; i(costheta[i]); dview[i].w=(numtyp)0; } - + ucl_copy(sw3,dview,false); sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); sw3_tex.bind_float(sw3,4); @@ -192,31 +192,32 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1 - // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 + // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, + this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/(KTHREADS*JTHREADS)))); + (BX/(KTHREADS*JTHREADS)))); this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, + this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -227,21 +228,24 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, + &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, + &end_ans->force, &end_ans->engv, 
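// Context for the extra arguments above: dev_acc evidently maps an atom
// index to the slot of that atom's own neighbor list in dev_nbor. The old
// fixed offset j+nbor_pitch only works when atom j's list sits in slot j,
// which holds for device-built lists (_gpu_nbor) but not for host-built
// ones, where slots follow ilist order -- hence the new dev_acc and
// _gpu_nbor arguments to the three-body "end" kernels (see k_sw_three_end
// below, which walks the neighbors of each neighbor j).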
&eflag, &vflag, &ainum, + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } + this->time_pair.stop(); } diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 1e358fb6f7..46330c59e4 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -138,16 +138,16 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int n_stride; int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -166,9 +166,9 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; - + for ( ; nbor0) - energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; + if (eflag>0) + energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; if (vflag>0) { virial[0] += delx*delx*force; @@ -329,29 +328,28 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \ } -__kernel void k_sw_three_center(const __global numtyp4 *restrict x_, +__kernel void k_sw_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sw1, const __global numtyp4 *restrict sw2, const __global numtyp4 *restrict sw3, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -360,7 +358,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; @@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z; if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y; numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype]; numtyp4 
sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); @@ -439,7 +431,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, f.x -= fjx + fkx; f.y -= fjy + fky; f.z -= fjz + fkz; - } + } } } // for nbor @@ -458,29 +450,29 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_sw_three_end(const __global numtyp4 *restrict x_, +__kernel void k_sw_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sw1, const __global numtyp4 *restrict sw2, const __global numtyp4 *restrict sw3, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const int t_per_atom) { + const __global int * dev_nbor, + const __global int * dev_packed, + const __global int * dev_acc, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const int t_per_atom, const int gpu_nbor) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -489,7 +481,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + int nbor_k,numk; if (dev_nbor==dev_packed) { + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { + nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; @@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, if (rsq2 < sw3_ikparam.y) { numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); @@ -605,22 +595,22 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict 
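// The branch above, reduced to its decision (sketch; j_list_row is a
// hypothetical helper mirroring the kernel logic):

inline int j_list_row(bool lists_alias, bool gpu_nbor,
                      const int *dev_acc, int j) {
  // device-built lists keep neighbor j's own list in row j; in every other
  // case the row has to be found through the accumulator map
  return (lists_alias && gpu_nbor) ? j : dev_acc[j];
}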
elem2param, const int nelements, - const __global int * dev_nbor, - const __global int * dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, - const int t_per_atom) { + const __global int * dev_nbor, + const __global int * dev_packed, + const __global int * dev_acc, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const int t_per_atom, const int gpu_nbor) { __local int tpa_sq, n_stride; tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma; numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; @@ -629,7 +619,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, virial[i]=(acctyp)0; __syncthreads(); - + if (ii sw3_ijparam.y) continue; numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma=sw1_ijparam.y; - sw_gamma=sw1_ijparam.w; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - - int nbor_k=j+nbor_pitch; - int numk=dev_nbor[nbor_k]; + + int nbor_k,numk; if (dev_nbor==dev_packed) { + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); nbor_k+=offset_k; } else { + nbor_k=dev_acc[j]+nbor_pitch; + numk=dev_nbor[nbor_k]; nbor_k+=nbor_pitch; nbor_k=dev_nbor[nbor_k]; k_end=nbor_k+numk; @@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, if (rsq2 < sw3_ikparam.y) { numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex); - sw_sigma=sw1_ikparam.y; - sw_gamma=sw1_ikparam.w; sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma; sw_cut_ik=sw3_ikparam.x; int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex); - sw_epsilon=sw1_ijkparam.x; - sw_lambda=sw1_ijkparam.z; sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon; sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk; numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex); diff --git a/lib/gpu/lal_sw.h b/lib/gpu/lal_sw.h index 66b36a90b0..3546f02eb7 100644 --- a/lib/gpu/lal_sw.h +++ b/lib/gpu/lal_sw.h @@ -24,28 +24,28 @@ template class SW : public BaseThree { public: SW(); - ~SW(); + ~SW(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successful * - -1 if fix gpu not found * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, + int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, + int* host_map, const int nelements, int*** host_elem2param, const
int nparams, const double* epsilon, const double* sigma, const double* lambda, const double* gamma, const double* costheta, const double* biga, const double* bigb, const double* powerp, const double* powerq, const double* cut, const double* cutsq); - + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -61,7 +61,7 @@ class SW : public BaseThree { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index e2d1b5e4dd..4959650c90 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,14 +27,14 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, +int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* sw_epsilon, const double* sw_sigma, const double* sw_lambda, const double* sw_gamma, const double* sw_costheta, const double* sw_biga, const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, + const double* sw_powerq, const double* sw_cut, const double* sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); @@ -46,7 +46,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now - if (gpu_split != 1.0) + if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); @@ -64,7 +64,7 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (world_me==0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); @@ -83,12 +83,12 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, + sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, + sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -113,12 +113,12 @@ int ** sw_gpu_compute_n(const int ago, const int inum_full, return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - -void sw_gpu_compute(const int ago, const int nlocal, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool 
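// On the early "return -8" above: sw/gpu rejects any host/device force
// split (gpu_split must be 1.0). The source says only "disable host/device
// split for now"; a plausible reason is that the three-body terms need the
// complete device-side neighbor data, which a split would divide.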
vatom, +} + +void sw_gpu_compute(const int ago, const int nlocal, const int nall, + const int nlist, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index c99bf85815..0de59c84b2 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -34,35 +34,35 @@ using namespace LAMMPS_AL; extern Device device; template -TableT::Table() : BaseAtomic(), +TableT::Table() : BaseAtomic(), _allocated(false), _compiled_styles(false) { } template -TableT::~Table() { +TableT::~Table() { clear(); } - + template int TableT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int TableT::init(const int ntypes, +int TableT::init(const int ntypes, double **host_cutsq, double ***host_table_coeffs, double **host_table_data, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, + const double gpu_split, FILE *_screen, int tabstyle, int ntables, int tablength) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,table,"k_table"); if (success!=0) return success; - + k_pair_linear.set_function(*(this->pair_program),"k_table_linear"); k_pair_linear_fast.set_function(*(this->pair_program),"k_table_linear_fast"); k_pair_spline.set_function(*(this->pair_program),"k_table_spline"); @@ -80,38 +80,38 @@ int TableT::init(const int ntypes, shared_types=true; } _lj_types=lj_types; - + _tabstyle = tabstyle; _ntables = ntables; if (tabstyle != BITMAP) _tablength = tablength; else _tablength = 1 << tablength; - + // Allocate a host write buffer for data initialization UCL_H_Vec host_write_int(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; iucl_device),UCL_READ_ONLY); nshiftbits.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); nmask.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); for (int ix=1; ix host_write(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); @@ -151,7 +151,7 @@ int TableT::init(const int ntypes, host_write2[n*_tablength+k].z = host_table_data[n][6*k+2]; // f host_write2[n*_tablength+k].w = (numtyp)0; } - } + } } ucl_copy(coeff3,host_write2,false); @@ -166,21 +166,21 @@ int TableT::init(const int ntypes, for (int n=0; n<_ntables; n++) { if (tabstyle == LINEAR) { for (int k=0; k<_tablength-1; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == SPLINE) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // e2 host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; 
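// Table setup above registers a dedicated kernel pair per style (linear,
// spline, bitmap) in addition to the base lookup kernels, and normalizes
// the table length: for BITMAP the incoming tablength is an exponent, so
// _tablength = 1 << tablength, while the other styles use it as given.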
// f2 host_write2[n*_tablength+k].w = (numtyp)0; } } else if (tabstyle == BITMAP) { for (int k=0; k<_tablength; k++) { - host_write2[n*_tablength+k].x = (numtyp)0; + host_write2[n*_tablength+k].x = (numtyp)0; host_write2[n*_tablength+k].y = host_table_data[n][6*k+3]; // de host_write2[n*_tablength+k].z = host_table_data[n][6*k+4]; // df host_write2[n*_tablength+k].w = host_table_data[n][6*k+5]; // drsq @@ -188,12 +188,12 @@ int TableT::init(const int ntypes, } } ucl_copy(coeff4,host_write2,false); - + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), UCL_WRITE_ONLY); cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); - + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -220,7 +220,7 @@ void TableT::clear() { coeff3.clear(); coeff4.clear(); sp_lj.clear(); - + if (_compiled_styles) { k_pair_linear_fast.clear(); k_pair_linear.clear(); @@ -230,7 +230,7 @@ void TableT::clear() { k_pair_bitmap.clear(); _compiled_styles=false; } - + this->clear_atomic(); } @@ -256,7 +256,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -269,67 +269,67 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, &ainum, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, + this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap_fast.set_size(GX,BX); this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, + &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); - } + } } else { if (_tabstyle == LOOKUP) { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + this->k_pair.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &eflag, + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { 
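// NOTE (illustrative sketch of the dispatch recipe every branch of loop()
// repeats; names are taken from the surrounding code): pick the kernel
// matching _tabstyle, size the grid so each atom owns _threads_per_atom
// lanes, then launch with the packed coefficient buffers:
//
//   int BX = this->block_pair();                  // threads per block
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//   this->k_pair_linear.set_size(GX,BX);          // then ...run(&args)
//
// The *_fast variants are reached only when shared_types is true, i.e. all
// per-type constants fit in shared memory, which is why they omit the
// _lj_types argument required by the generic kernels.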
this->k_pair_linear.set_size(GX,BX); this->k_pair_linear.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, + &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { this->k_pair_spline.set_size(GX,BX); this->k_pair_spline.run(&this->atom->x, &tabindex, &coeff2, &coeff3, - &coeff4, &_lj_types, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &coeff4, &_lj_types, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { this->k_pair_bitmap.set_size(GX,BX); - this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, + this->k_pair_bitmap.run(&this->atom->x, &tabindex, &nshiftbits, &nmask, &coeff2, &coeff3, &coeff4, &_lj_types, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, + &nbor_pitch, &this->_threads_per_atom, &_tablength); } } diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 1033b7fbb8..971b56d96e 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -39,39 +39,39 @@ typedef union { /// ---------------- LOOKUP ------------------------------------------------- -__kernel void k_table(const __global numtyp4 *restrict x_, +__kernel void k_table(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -136,21 +136,21 @@ __kernel void k_table(const __global numtyp4 *restrict x_, __kernel void k_table_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const 
__global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -158,18 +158,18 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } @@ -235,24 +235,24 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, /// ---------------- LINEAR ------------------------------------------------- -__kernel void k_table_linear(const __global numtyp4 *restrict x_, +__kernel void k_table_linear(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -265,9 +265,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -334,23 +334,23 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, +__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const __global numtyp *restrict cutsq_in, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global 
acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -358,7 +358,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e = (numtyp)0.0; - if (itable < tlm1) + if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -439,39 +439,39 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, /// ---------------- SPLINE ------------------------------------------------- -__kernel void k_table_spline(const __global numtyp4 *restrict x_, +__kernel void k_table_spline(const __global numtyp4 *restrict x_, const __global int *restrict tabindex, - const __global numtyp4 *restrict coeff2, + const __global numtyp4 *restrict coeff2, const __global numtyp4 *restrict coeff3, const __global numtyp4 *restrict coeff4, const int lj_types, const __global numtyp *restrict cutsq, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii0) { numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -545,23 +545,23 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_table_spline_fast(const __global numtyp4 *x_, +__kernel void k_table_spline_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -569,7 +569,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, if (tid0) { 
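// NOTE (background for the expression below; not part of this patch): this
// is standard cubic-spline interpolation on the r^2 grid. With
// b = (rsq - rsq_i)*invdelta and a = 1 - b,
//
//   e(rsq) = a*e_i + b*e_{i+1}
//          + ((a^3 - a)*e2_i + (b^3 - b)*e2_{i+1}) * delta^2/6
//
// coeff2.z carries the delta^2/6 factor (deltasq6 in lal_table.h), and
// coeff4.y holds the second derivatives e2 packed by init(); the force path
// above applies the same identity to coeff3.z/coeff4.z (f, f2).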
numtyp e = (numtyp)0.0; if (itable < tlm1) { - e = a * coeff3[idx].y + b * coeff3[idx+1].y + - ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * + e = a * coeff3[idx].y + b * coeff3[idx+1].y + + ((a*a*a-a)*coeff4[idx].y + (b*b*b-b)*coeff4[idx+1].y) * coeff2[mtype].z; - } + } energy+=factor_lj*e; } if (vflag>0) { @@ -657,41 +657,41 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, /// ---------------- BITMAP ------------------------------------------------- -__kernel void k_table_bitmap(const __global numtyp4 *x_, +__kernel void k_table_bitmap(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const int lj_types, const __global numtyp *cutsq, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - + acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + int tlm1 = tablength - 1; - + if (ii>= nshiftbits[mtype]; @@ -734,14 +734,14 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, value = coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } @@ -761,25 +761,25 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } // if ii } -__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, +__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, const __global int *tabindex, - const __global int *nshiftbits, + const __global int *nshiftbits, const __global int *nmask, - const __global numtyp4* coeff2, + const __global numtyp4* coeff2, const __global numtyp4 *coeff3, const __global numtyp4 *coeff4, const __global numtyp *cutsq_in, - const __global numtyp* sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *ans, - __global acctyp *engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global numtyp* sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *ans, + __global acctyp *engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, int tablength) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -787,18 +787,18 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, if (tid>= nshiftbits[mtype]; @@ -842,14 +842,14 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, value = 
coeff3[idx].z + fraction*coeff4[idx].z; force = factor_lj * value; } else force = (numtyp)0.0; - + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { numtyp e = (numtyp)0.0; - if (itable <= tlm1) + if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 0e04737d27..f667336679 100644 --- a/lib/gpu/lal_table.h +++ b/lib/gpu/lal_table.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Table : public BaseAtomic { public: Table(); - ~Table(); + ~Table(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -38,10 +38,10 @@ class Table : public BaseAtomic { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double** cutsq, double ***host_table_coeffs, - double **host_table_data, + double **host_table_data, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, int tabstyle, int ntables, int tablength); @@ -54,42 +54,42 @@ class Table : public BaseAtomic { /// Total host memory used by library for pair style double host_memory_usage() const; - + // ------------------------- DEVICE KERNELS ------------------------- UCL_Kernel k_pair_linear, k_pair_linear_fast; UCL_Kernel k_pair_spline, k_pair_spline_fast; UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; - + // --------------------------- TYPE DATA -------------------------- UCL_D_Vec tabindex, nshiftbits, nmask; - - /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, + + /// coeff2.x = innersq, coeff2.y = invdelta, coeff2.z = deltasq6, UCL_D_Vec coeff2; - + /// coeff3.x = rsq, coeff3.y = e, coeff3.z = f UCL_D_Vec coeff3; - + /// coeff4.x = de, coeff4.y = df UCL_D_Vec coeff4; - + UCL_D_Vec cutsq; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// Table style, length and number of tables int _tabstyle,_tablength,_ntables; - + private: bool _allocated, _compiled_styles; - + void loop(const bool _eflag, const bool _vflag); }; diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 172acb7d39..a2b5c61e74 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -30,7 +30,7 @@ static Table TBMF; int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const 
double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); @@ -73,11 +73,11 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** table_gpu_compute_n(const int ago, const int inum_full, return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void table_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index bc89c53765..bf634cffc2 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, 
&this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index e98a454f58..b7d48d9e34 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_c2) return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * + // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * ucl_powr(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index bfcc9c3bd3..a01bcf63b1 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index ba4ad32005..3a81b36941 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const 
__global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) * - ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; + ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; if (tmp < param_ca4) return (numtyp)0.0; numtyp tmp_n = ucl_powr(tmp,param_powern); return (numtyp)-0.5 *(param_powern/param_powern_del) * - ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / + ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / ((numtyp)2.0*param_powern_del)))*tmp_n / zeta; } diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 57688f53ab..c1f3f25c04 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall, else _eflag=0; - int ainum=nall; + int ainum=nlist; int nbor_pitch=this->nbor->nbor_pitch(); int BX=this->block_pair(); int GX=static_cast(ceil(static_cast(ainum)/ @@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full, this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom); + &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); int evatom=0; if (eatom || vatom) @@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_acc, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } this->time_pair.stop(); diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 0d6c5a38d6..9509b9802c 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, - const int eflag, const int nall, const int inum, + const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int 
tpa_sq,n_stride; tpa_sq = fast_mul(t_per_atom,t_per_atom); @@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __syncthreads(); - if (ii param_c2) return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * + // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * ucl_powr(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 585dc069a0..a316d195ac 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -33,19 +33,19 @@ YukawaT::Yukawa() : BaseAtomic(), _allocated(false) { } template -YukawaT::~Yukawa() { +YukawaT::~Yukawa() { clear(); } - + template int YukawaT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int YukawaT::init(const int ntypes, +int YukawaT::init(const int ntypes, double **host_cutsq, double kappa, - double **host_a, double **host_offset, + double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -75,7 +75,7 @@ int YukawaT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset, - host_cutsq); + host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); @@ -83,7 +83,7 @@ int YukawaT::init(const int ntypes, ucl_copy(sp_lj,dview,false); _kappa = kappa; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); return 0; @@ -122,7 +122,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,7 +134,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, - &vflag, &ainum, &nbor_pitch, + &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index b0c3b9978d..a8d637ec97 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -24,14 +24,14 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_yukawa(const __global numtyp4 *restrict x_, +__kernel void k_yukawa(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff, const numtyp kappa, const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 
*restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; @@ -49,20 +49,20 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff[mtype].x*screening*rinv; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; @@ -109,19 +109,19 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff_in, - const numtyp kappa, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, + const numtyp kappa, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -129,7 +129,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e=coeff[mtype].x*screening*rinv; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 720dc903d0..4cc23c03e9 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class Yukawa : public BaseAtomic { public: Yukawa(); - ~Yukawa(); + ~Yukawa(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,8 +39,8 @@ class Yukawa : public BaseAtomic { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double kappa, double **host_a, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); /// Clear all host and device data @@ -57,16 +57,16 @@ class Yukawa : public BaseAtomic { /// coeff.x = a, coeff.y = offset, coeff.z = cutsq UCL_D_Vec coeff; - + /// Special LJ values UCL_D_Vec sp_lj; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; - + /// kappa numtyp _kappa; diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp index 
70282a7117..af29938a68 100644 --- a/lib/gpu/lal_yukawa_colloid.cpp +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -29,23 +29,23 @@ using namespace LAMMPS_AL; extern Device device; template -YukawaColloidT::YukawaColloid() : BaseAtomic(), +YukawaColloidT::YukawaColloid() : BaseAtomic(), _max_rad_size(0), _allocated(false) { } template -YukawaColloidT::~YukawaColloid() { +YukawaColloidT::~YukawaColloid() { clear(); } - + template int YukawaColloidT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); } template -int YukawaColloidT::init(const int ntypes, - double **host_cutsq, double **host_a, +int YukawaColloidT::init(const int ntypes, + double **host_cutsq, double **host_a, double **host_offset, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -62,16 +62,16 @@ int YukawaColloidT::init(const int ntypes, _shared_view=false; // allocate rad - + int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - + _max_rad_size=static_cast(static_cast(ef_nall)*1.10); - + if (_shared_view==false) c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); - + rad_tex.get_texture(*(this->pair_program),"rad_tex"); rad_tex.bind_float(c_rad,1); @@ -96,13 +96,13 @@ int YukawaColloidT::init(const int ntypes, coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a, - host_offset,host_cutsq); + host_offset,host_cutsq); UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); ucl_copy(sp_lj,dview,false); - + _allocated=true; this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); return 0; @@ -131,15 +131,15 @@ double YukawaColloidT::host_memory_usage() const { // Copy nbor list from host if necessary and then compute atom energies/forces // --------------------------------------------------------------------------- template -void YukawaColloidT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, int *ilist, +void YukawaColloidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *rad) { this->acc_timers(); - + // ------------------- Resize rad array -------------------------- - + if (nall>_max_rad_size) { _max_rad_size=static_cast(static_cast(nall)*1.10); if (_shared_view==false) { @@ -157,7 +157,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->zero_timers(); return; } - + int ago=this->hd_balancer.ago_first(f_ago); int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); this->ans->inum(inum); @@ -170,7 +170,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, if (!success) return; } - + this->atom->cast_x_data(host_x,host_type); this->cast_rad_data(rad); this->hd_balancer.start_timer(); @@ -182,7 +182,7 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } - + // 
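// NOTE (sketch of the guard both compute() paths apply before touching the
// per-atom radius buffer; the static_cast template arguments are assumed):
//
//   if (nall > _max_rad_size) {
//     _max_rad_size = static_cast<int>(static_cast<double>(nall)*1.10);
//     if (_shared_view == false) {
//       c_rad.resize(_max_rad_size);   // resize may move the device buffer,
//       rad_tex.bind_float(c_rad, 1);  // so the texture must be re-bound
//     }
//   }
//
// The ~10% headroom keeps reallocation (and texture re-binding) rare as the
// local+ghost atom count nall fluctuates between neighbor builds.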
--------------------------------------------------------------------------- // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- @@ -190,24 +190,24 @@ template int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *rad) { this->acc_timers(); - + // ------------------- Resize rad array ---------------------------- - + if (nall>_max_rad_size) { _max_rad_size=static_cast(static_cast(nall)*1.10); if (_shared_view==false) { c_rad.resize(_max_rad_size); rad_tex.bind_float(c_rad,1); } - } + } // ----------------------------------------------------------------- - + if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -215,21 +215,21 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall this->zero_timers(); return NULL; } - + // load balance, returning the atom count on the device (inum) this->hd_balancer.balance(cpu_time); int inum=this->hd_balancer.get_gpu_count(ago,inum_full); this->ans->inum(inum); host_start=inum; - - // Build neighbor list on GPU if necessary + + // Build neighbor list on GPU if necessary if (ago==0) { this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; this->cast_rad_data(rad); - this->hd_balancer.start_timer(); + this->hd_balancer.start_timer(); } else { this->atom->cast_x_data(host_x,host_type); this->cast_rad_data(rad); @@ -265,7 +265,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -280,8 +280,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + this->k_pair.run(&this->atom->x, &c_rad, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index f9f4767123..48ab47bc94 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : nguyentd@ornl.gov // ***************************************************************************/ @@ -29,15 +29,15 @@ texture rad_tex; #define rad_tex rad_ #endif -__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, const __global numtyp *restrict rad_, - const __global numtyp4 *restrict coeff, - const int lj_types, - const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global 
int *dev_packed, + const __global numtyp4 *restrict coeff, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, __global acctyp4 *restrict ans, - __global acctyp *restrict engv, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const numtyp kappa) { @@ -56,21 +56,21 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; - + if (ii0) { numtyp e=coeff[mtype].x/kappa * screening; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; @@ -118,20 +118,20 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, +__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, const __global numtyp *restrict rad_, - const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff_in, const __global numtyp *restrict sp_lj_in, - const __global int *dev_nbor, - const __global int *dev_packed, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, - const int inum, const int nbor_pitch, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, const int t_per_atom, const numtyp kappa) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -139,7 +139,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, if (tid0) { numtyp e=coeff[mtype].x/kappa * screening; - energy+=factor_lj*(e-coeff[mtype].y); + energy+=factor_lj*(e-coeff[mtype].y); } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h index 5a9ee7ae6e..ba69bc4bae 100644 --- a/lib/gpu/lal_yukawa_colloid.h +++ b/lib/gpu/lal_yukawa_colloid.h @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -24,13 +24,13 @@ template class YukawaColloid : public BaseAtomic { public: YukawaColloid(); - ~YukawaColloid(); + ~YukawaColloid(); /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * + * * Returns: * - 0 if successfull * - -1 if fix gpu not found @@ -39,8 +39,8 @@ class YukawaColloid : public BaseAtomic { * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, double **host_a, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const double kappa); inline void cast_rad_data(double* rad) { @@ -70,22 +70,22 @@ class YukawaColloid : public BaseAtomic { /// Total host 
memory used by library for pair style double host_memory_usage() const; - + /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, + void compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *rad); - + /// Pair loop with device neighboring int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, double *rad); // --------------------------- TEXTURES ----------------------------- @@ -101,7 +101,7 @@ class YukawaColloid : public BaseAtomic { /// If atom type constants fit in shared memory, use fast kernels bool shared_types; - /// Number of atom types + /// Number of atom types int _lj_types; int _max_rad_size; diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index 0e3c653e06..e2b0354d10 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -27,10 +27,10 @@ static YukawaColloid YKCOLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, +int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, + const double cell_size, int &gpu_mode, FILE *screen, const double kappa) { YKCOLLMF.clear(); gpu_mode=YKCOLLMF.device->gpu_mode(); @@ -54,8 +54,8 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) - init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); @@ -72,12 +72,12 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) 
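A sketch of the staged initialization this wrapper shares with the other
lal_*_ext.cpp init functions: one process per world initializes first, then
the ranks sharing a GPU take their turn behind barriers. The procs_per_gpu
loop bound and the elided argument lists below are assumptions; only the
world_me/gpu_rank branches appear verbatim in the hunks above.

    int init_ok = 0;
    if (world_me == 0)                     // one rank builds first
      init_ok = YKCOLLMF.init(/* ... */);
    YKCOLLMF.device->world_barrier();
    for (int i = 0; i < procs_per_gpu; i++) {
      if (gpu_rank == i && world_me != 0)  // then one wave of ranks at a time
        init_ok = YKCOLLMF.init(/* ... */);
      YKCOLLMF.device->gpu_barrier();
    }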
@@ -103,11 +103,11 @@ int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_rad); -} - -void ykcolloid_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, +} + +void ykcolloid_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_rad) { diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 1cc89885aa..9d38387bc1 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : nguyentd@ornl.gov ***************************************************************************/ @@ -28,9 +28,9 @@ static Yukawa YKMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, - double **host_a, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + double **host_a, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { YKMF.clear(); gpu_mode=YKMF.device->gpu_mode(); @@ -54,8 +54,8 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, int init_ok=0; if (world_me==0) - init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); @@ -72,12 +72,12 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); - if (message) + if (message) fprintf(screen,"Done.\n"); } if (message) @@ -102,8 +102,8 @@ int ** yukawa_gpu_compute_n(const int ago, const int inum_full, return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); -} - +} + void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp index e172d48b33..77e55a62f9 100644 --- a/lib/gpu/lal_zbl.cpp +++ b/lib/gpu/lal_zbl.cpp @@ -9,7 +9,7 @@ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ - begin : + begin : email : ndactrung@gmail.com ***************************************************************************/ @@ -33,10 +33,10 @@ ZBLT::ZBL() 
: BaseAtomic(), _allocated(false) { } template -ZBLT::~ZBL() { +ZBLT::~ZBL() { clear(); } - + template int ZBLT::bytes_per_atom(const int max_nbors) const { return this->bytes_per_atom_atomic(max_nbors); @@ -44,15 +44,15 @@ int ZBLT::bytes_per_atom(const int max_nbors) const { template int ZBLT::init(const int ntypes, double **host_cutsq, - double **host_sw1, double **host_sw2, - double **host_sw3, double **host_sw4, + double **host_sw1, double **host_sw2, + double **host_sw3, double **host_sw4, double **host_sw5, - double **host_d1a, double **host_d2a, - double **host_d3a, double **host_d4a, - double **host_zze, double cut_globalsq, + double **host_d1a, double **host_d2a, + double **host_d3a, double **host_d4a, + double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, @@ -79,16 +79,16 @@ int ZBLT::init(const int ntypes, double **host_cutsq, coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_sw1,host_sw2, - host_zze, host_cutsq); + host_zze, host_cutsq); coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_d1a,host_d2a, - host_d3a,host_d4a); + host_d3a,host_d4a); coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5); - _cut_globalsq = cut_globalsq; + _cut_globalsq = cut_globalsq; _cut_innersq = cut_innersq; _cut_inner = cut_inner; @@ -131,7 +131,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { vflag=1; else vflag=0; - + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index b14753b5fa..b7f379c833 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : // email : ndactrung@gmail.com // ***************************************************************************/ @@ -35,9 +35,9 @@ texture pos_tex; compute ZBL pair energy ------------------------------------------------------------------------- */ -ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, +ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, numtyp d3aij, numtyp d4aij, numtyp zzeij) { - + numtyp rinv = ucl_recip(r); numtyp sum = c1*ucl_exp(-d1aij*r); @@ -54,7 +54,7 @@ ucl_inline numtyp e_zbl(numtyp r, numtyp d1aij, numtyp d2aij, compute ZBL first derivative ------------------------------------------------------------------------- */ -ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, +ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, numtyp d3aij, numtyp d4aij, numtyp zzeij) { numtyp rinv = ucl_recip(r); @@ -72,24 +72,24 @@ ucl_inline numtyp dzbldr(numtyp r, numtyp d1aij, numtyp d2aij, sum_p -= c2*d2aij*e2; sum_p -= c3*d3aij*e3; sum_p -= c4*d4aij*e4; - + numtyp result = zzeij*(sum_p - sum*rinv)*rinv; - + return result; }; -__kernel void k_zbl(const __global numtyp4 *restrict x_, +__kernel void 
                     const __global numtyp4 *restrict coeff1,
                     const __global numtyp4 *restrict coeff2,
                     const __global numtyp4 *restrict coeff3,
-                    const double cut_globalsq, 
-                    const double cut_innersq, 
-                    const double cut_inner, 
-                    const int lj_types, 
-                    const __global int *dev_nbor, 
-                    const __global int *dev_packed, 
+                    const double cut_globalsq,
+                    const double cut_innersq,
+                    const double cut_inner,
+                    const int lj_types,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed,
                     __global acctyp4 *restrict ans,
-                    __global acctyp *restrict engv, 
+                    __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -101,19 +101,19 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
- 
+
   if (ii<inum) {
     int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
       int j=dev_packed[nbor];
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
 
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       int mtype=itype*lj_types+jtype;
       if (rsq<cut_globalsq) {
         numtyp r, t;
         r = ucl_sqrt(rsq);
         numtyp force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                               coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
- 
-        if (rsq>cut_innersq) {
-          t = r - cut_inner;
-          force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-        } 
+
+        if (rsq>cut_innersq) {
+          t = r - cut_inner;
+          force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+        }
 
         force *= (numtyp)-1.0*ucl_recip(r);
@@ -146,14 +146,14 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                          coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-          e += coeff3[mtype].z; 
-          if (rsq > cut_innersq) { 
-            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t); 
-          } 
+          e += coeff3[mtype].z;
+          if (rsq > cut_innersq) {
+            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+          }
 
-          energy+=e; 
+          energy+=e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
@@ -171,22 +171,22 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_zbl_fast(const __global numtyp4 *restrict x_, 
+__kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1_in,
                          const __global numtyp4 *restrict coeff2_in,
                          const __global numtyp4 *restrict coeff3_in,
-                         const double cut_globalsq, 
-                         const double cut_innersq, 
-                         const double cut_inner, 
+                         const double cut_globalsq,
+                         const double cut_innersq,
+                         const double cut_inner,
                          const __global int *dev_nbor,
-                         const __global int *dev_packed, 
+                         const __global int *dev_packed,
                          __global acctyp4 *restrict ans,
-                         __global acctyp *restrict engv, 
-                         const int eflag, const int vflag, const int inum, 
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
- 
+
   __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -195,7 +195,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     coeff2[tid]=coeff2_in[tid];
     coeff3[tid]=coeff3_in[tid];
   }
- 
+
   acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@@ -204,7 +204,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     virial[i]=(acctyp)0;
 
   __syncthreads();
- 
+
   if (ii<inum) {
     int nbor, nbor_end;
     int i, numj;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
       int j=dev_packed[nbor];
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int mtype=itype+jx.w;
 
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
       if (rsq<cut_globalsq) {
         numtyp r, t;
         r = ucl_sqrt(rsq);
         numtyp force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                               coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
- 
-        if (rsq>cut_innersq) {
-          t = r - cut_inner;
-          force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-        } 
+
+        if (rsq>cut_innersq) {
+          t = r - cut_inner;
+          force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+        }
 
         force *= (numtyp)-1.0*ucl_recip(r);
@@ -249,14 +249,14 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         if (eflag>0) {
-          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, 
+          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                          coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-          e += coeff3[mtype].z; 
-          if (rsq > cut_innersq) { 
-            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t); 
-          } 
+          e += coeff3[mtype].z;
+          if (rsq > cut_innersq) {
+            e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+          }
 
-          energy+=e; 
+          energy+=e;
         }
         if (vflag>0) {
           virial[0] += delx*delx*force;
diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h
index 2996d90a5c..9885fcedf2 100644
--- a/lib/gpu/lal_zbl.h
+++ b/lib/gpu/lal_zbl.h
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -24,27 +24,27 @@
 template <class numtyp, class acctyp>
 class ZBL : public BaseAtomic<numtyp, acctyp> {
  public:
   ZBL();
-  ~ZBL(); 
+  ~ZBL();
 
   /// Clear any previous data and set up for a new LAMMPS run
   /** \param max_nbors initial number of rows in the neighbor matrix
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
-    * 
+    *
     * Returns:
     * -  0 if successfull
     * - -1 if fix gpu not found
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, double **host_cutsq, double **host_sw1, 
+  int init(const int ntypes, double **host_cutsq, double **host_sw1,
            double **host_sw2, double **host_sw3, double **host_sw4,
            double **host_sw5,
-           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+           double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
            double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-           const int nlocal, const int nall, const int max_nbors, 
-           const int maxspecial, const double cell_size, 
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
            const double gpu_split, FILE *screen);
- 
+
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
@@ -70,8 +70,8 @@ class ZBL : public BaseAtomic<numtyp, acctyp> {
   double _cut_globalsq;
   double _cut_innersq;
   double _cut_inner;
- 
-  /// Number of atom types 
+
+  /// Number of atom types
   int _lj_types;
 
  private:
diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp
index ddce858076..37aa74351b 100644
--- a/lib/gpu/lal_zbl_ext.cpp
+++ b/lib/gpu/lal_zbl_ext.cpp
@@ -9,7 +9,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
  __________________________________________________________________________
 
-    begin                : 
+    begin                :
     email                : ndactrung@gmail.com
 ***************************************************************************/
@@ -27,11 +27,11 @@ static ZBL<PRECISION,ACC_PRECISION> ZBLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, 
+int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
                  double **host_sw2, double **host_sw3, double **host_sw4,
                  double **host_sw5,
-                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a, 
+                 double **host_d1a, double **host_d2a, double **host_d3a, double **host_d4a,
                  double **host_zze, double cut_globalsq, double cut_innersq, double cut_inner,
-                 const int inum, const int nall, const int max_nbors, 
+                 const int inum, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) {
   ZBLMF.clear();
   gpu_mode=ZBLMF.device->gpu_mode();
@@ -55,7 +55,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+    init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                        host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
                        cut_globalsq, cut_innersq, cut_inner,
                        inum, nall, 300, maxspecial, cell_size, gpu_split, screen);
@@ -74,13 +74,13 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, 
+      init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4,
                          host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze,
-                         cut_globalsq, cut_innersq, cut_inner, 
+                         cut_globalsq, cut_innersq, cut_inner,
                          inum, nall, 300, maxspecial, cell_size, gpu_split, screen);
 
     ZBLMF.device->gpu_barrier();
-    if (message) 
+    if (message)
       fprintf(screen,"Done.\n");
   }
   if (message)
@@ -105,8 +105,8 @@ int ** zbl_gpu_compute_n(const int ago, const int inum_full,
   return ZBLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                        subhi, tag, nspecial, special, eflag, vflag, eatom,
                        vatom, host_start, ilist, jnum, cpu_time, success);
-} 
- 
+}
+
 void zbl_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
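
For reference, the pair math evaluated by the k_zbl kernels above can be written as a small standalone C++ sketch (illustrative only; it is not part of this commit). The constants c1..c4 are the coefficients of the ZBL universal screening function; the d1a..d4a exponents (d_k divided by the screening length a_ij), the zze prefactor (Zi*Zj*e^2/(4*pi*eps0)), and the sw1/sw2 switching coefficients are precomputed per type pair by ZBLT::init() from the arrays passed through zbl_gpu_init(). The function bodies mirror the e_zbl()/dzbldr() helpers and the kernel force branch shown in the diff; the standalone file, its free-function signatures, and the name force_over_r are hypothetical.

#include <cmath>

// ZBL universal screening coefficients; c_k pairs with exponent d_k
// (d1 = 0.20162 ... d4 = 3.19980), folded into the precomputed d*a arguments.
static const double c1 = 0.02817, c2 = 0.28022,
                    c3 = 0.50986, c4 = 0.18175;

// Pair energy, as in e_zbl() above: E(r) = (zze/r) * sum_k c_k*exp(-d_k*r/a).
double e_zbl(double r, double d1a, double d2a, double d3a, double d4a,
             double zze) {
  const double rinv = 1.0 / r;
  double sum = c1 * std::exp(-d1a * r);
  sum += c2 * std::exp(-d2a * r);
  sum += c3 * std::exp(-d3a * r);
  sum += c4 * std::exp(-d4a * r);
  return zze * sum * rinv;
}

// First derivative dE/dr, as in dzbldr() above: product rule on sum(r)/r.
double dzbldr(double r, double d1a, double d2a, double d3a, double d4a,
              double zze) {
  const double rinv = 1.0 / r;
  const double e1 = std::exp(-d1a * r), e2 = std::exp(-d2a * r);
  const double e3 = std::exp(-d3a * r), e4 = std::exp(-d4a * r);
  const double sum   = c1 * e1 + c2 * e2 + c3 * e3 + c4 * e4;
  const double sum_p = -c1 * d1a * e1 - c2 * d2a * e2
                       - c3 * d3a * e3 - c4 * d4a * e4;
  return zze * (sum_p - sum * rinv) * rinv;
}

// Scalar force prefactor following the k_zbl_fast branch above: beyond the
// inner cutoff the derivative is augmented by the switching polynomial
// t^2*(sw1 + sw2*t) (coeff1.x/.y in the kernels), then scaled by -1/r before
// multiplying the (delx,dely,delz) components.  The energy branch applies
// the analogous shift sw5 (coeff3.z) plus t^3*(sw3 + sw4*t).
double force_over_r(double r, double cut_inner, double cut_innersq,
                    double sw1, double sw2, double d1a, double d2a,
                    double d3a, double d4a, double zze) {
  double force = dzbldr(r, d1a, d2a, d3a, d4a, zze);
  if (r * r > cut_innersq) {          // smooth switching region
    const double t = r - cut_inner;
    force += t * t * (sw1 + sw2 * t);
  }
  return -force / r;
}

One detail worth noting: in this snapshot k_zbl assigns force = t*t*(...) inside the switching branch, while k_zbl_fast accumulates with force +=; the sketch follows the accumulating form.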