git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

View File

@ -1,6 +1,6 @@
# Settings that the LAMMPS build will import when this package library is used
# settings for OpenCL builds
gpu_SYSINC =
-gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
+gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
gpu_SYSPATH =

View File

@ -7,7 +7,7 @@
EXTRAMAKE = Makefile.lammps.standard
-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
CUDA_HOME = /usr/local/cuda
endif

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL
EXTRAMAKE = Makefile.lammps.mingw-cross

View File

@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-I../../tools/mingw-cross/mpich2-win32/include/ \
-DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../src/STUBS -lmpi_mingw64
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-I../../tools/mingw-cross/mpich2-win64/include/ \
-DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -35,7 +35,7 @@ namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef CUstream command_queue;
inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream));
@ -59,21 +59,21 @@ struct NVDProperties {
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
inline ~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
inline std::string platform_name()
{ return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Delete any contexts/data and set the platform number to be used
@ -97,24 +97,24 @@ class UCL_Device {
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(CUstream());
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
}
/// Remove a stream for device computations
@ -124,19 +124,19 @@ class UCL_Device {
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0;
else _cq[0]=_cq[i];
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
@ -148,38 +148,38 @@ class UCL_Device {
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }
/// Returns true if double precision is supported for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
inline bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].multiProcessorCount; }
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32;
else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48;
else return _properties[i].multiProcessorCount*192; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
@ -188,13 +188,13 @@ class UCL_Device {
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree);
@ -203,21 +203,21 @@ class UCL_Device {
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
@ -242,7 +242,7 @@ class UCL_Device {
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return false; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
@ -259,7 +259,7 @@ class UCL_Device {
/// List all devices along with all properties
inline void print_all(std::ostream &out);
private:
int _device, _num_devices;
std::vector<NVDProperties> _properties;
@ -279,16 +279,16 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
if (major==9999)
continue;
_properties.push_back(NVDProperties());
_properties.back().device_id=dev;
_properties.back().major=major;
_properties.back().minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@ -296,23 +296,23 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode,
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif
}
@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
#endif
if (num_devices() == 0)
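
For orientation, a minimal usage sketch for the UCL_Device API in this file; it assumes geryon's nvd_device.h, an installed CUDA driver, and uses only members that appear above (device index 0 is arbitrary):

#include <iostream>
#include "nvd_device.h"

int main() {
  ucl_cudadr::UCL_Device gpu;          // collects properties for every GPU
  if (gpu.num_devices() == 0) return 1;
  gpu.set(0);                          // make GPU 0 the active device
  std::cout << gpu.name() << ": arch " << gpu.arch()
            << ", " << gpu.cus() << " compute units, "
            << gpu.gigabytes() << " GB\n";
  gpu.push_command_queue();            // add stream 1 next to the null stream
  gpu.sync(1);                         // block until stream 1 has drained
  return 0;
}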

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -35,15 +35,15 @@ template <class numtyp> class UCL_D_Mat;
template <class hosttype, class devtype> class UCL_Vector;
template <class hosttype, class devtype> class UCL_Matrix;
#define UCL_MAX_KERNEL_ARGS 256
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) {
_cq=device.cq();
init(device);
load_string(program,flags,log);
}
@ -61,20 +61,20 @@ class UCL_Program {
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
@ -94,12 +94,12 @@ class UCL_Program {
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
options,(void **)values);
if (log!=NULL)
*log=std::string(clog);
if (err != CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
@ -108,24 +108,24 @@ class UCL_Program {
#endif
return UCL_COMPILE_ERROR;
}
return UCL_SUCCESS;
}
/// Load a precompiled program from a file
inline int load_binary(const char *filename) {
CUmodule _module;
CUresult err = cuModuleLoad(&_module,filename);
if (err==301) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open binary kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
} else if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Error loading binary kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
@ -138,7 +138,7 @@ class UCL_Program {
// return UCL_ERROR;
return UCL_SUCCESS;
}
friend class UCL_Kernel;
private:
CUmodule _module;
@ -149,23 +149,23 @@ class UCL_Program {
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
}
~UCL_Kernel() {}
/// Clear any function associated with the kernel
@ -189,7 +189,7 @@ class UCL_Kernel {
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, const dtype * const arg) {
@ -202,27 +202,27 @@ class UCL_Kernel {
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
@ -257,37 +257,37 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
@ -303,43 +303,43 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
@ -352,10 +352,10 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
@ -367,12 +367,12 @@ class UCL_Kernel {
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() {
_num_args=0;
#if CUDA_VERSION < 4000
_offsets.clear();
_param_size=0;
#endif
}
@ -390,7 +390,7 @@ class UCL_Kernel {
unsigned _num_blocks[3];
unsigned _num_args;
friend class UCL_Texture;
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
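
A hedged sketch of the compile-and-launch flow this header provides; dev is an initialized UCL_Device, ptx a PTX image, and the kernel name "scale_vec", its arguments, and d_x are hypothetical placeholders:

ucl_cudadr::UCL_Program prog(dev);
std::string log;
if (prog.load_string(ptx, "", &log) != UCL_SUCCESS)
  std::cerr << log << std::endl;            // compiler log on failure
ucl_cudadr::UCL_Kernel k(prog, "scale_vec");
int n = 1024;
k.add_arg(&n);                              // plain argument
k.add_arg(&d_x);                            // geryon container overload above
k.set_size(num_blocks, block_size);         // 1-D launch configuration
k.run();                                    // asynchronous on the program's queue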

View File

@ -17,12 +17,12 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef NVD_MAT_H
#define NVD_MAT_H
@ -52,6 +52,6 @@ namespace ucl_cudadr {
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_cudadr
#endif

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -46,7 +46,7 @@ typedef CUdeviceptr device_ptr;
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED)
@ -62,7 +62,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
CUresult err=CUDA_SUCCESS;
if (kind==UCL_NOT_PINNED)
@ -95,7 +95,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
else if (mat.kind()==UCL_WRITE_ONLY)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
err=cuMemAllocHost((void **)mat.host_ptr(),n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
@ -130,30 +130,30 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
unsigned upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=d.cq();
return UCL_SUCCESS;
}
template <class mat_type>
inline void _device_free(mat_type &mat) {
@ -175,33 +175,33 @@ inline int _device_resize(mat_type &mat, const size_t rows,
const size_t cols, size_t &pitch) {
_device_free(mat);
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
*ptr=0;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
const size_t offset, const size_t numsize) {
*ptr=in+offset*numsize;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
const size_t offset, const size_t numsize) {
*ptr=0;
}
// --------------------------------------------------------------------------
@ -211,13 +211,13 @@ template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
@ -245,7 +245,7 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
const size_t spitch, const size_t cols,
const size_t rows) {
ins.srcXInBytes=0;
@ -257,13 +257,13 @@ inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
ins.WidthInBytes=cols;
ins.Height=rows;
}
template <int mem> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1>
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2>
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem> struct _nvd_set_2D_mem
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
@ -285,7 +285,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -297,7 +297,7 @@ template<> struct _ucl_memcpy<2,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -322,7 +322,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -334,7 +334,7 @@ template<> struct _ucl_memcpy<2,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -359,7 +359,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -371,7 +371,7 @@ template<> struct _ucl_memcpy<2,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -396,7 +396,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -408,7 +408,7 @@ template<> struct _ucl_memcpy<0,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -433,7 +433,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -445,7 +445,7 @@ template<> struct _ucl_memcpy<1,2> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -470,7 +470,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -482,7 +482,7 @@ template <> struct _ucl_memcpy<1,0> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -507,7 +507,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -519,7 +519,7 @@ template <> struct _ucl_memcpy<0,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -542,7 +542,7 @@ template <> struct _ucl_memcpy<1,1> {
CUstream &cq)
{ memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
@ -554,7 +554,7 @@ template <> struct _ucl_memcpy<1,1> {
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
@ -579,18 +579,18 @@ template <int mem1, int mem2> struct _ucl_memcpy {
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
@ -601,12 +601,12 @@ template <int mem1, int mem2> struct _ucl_memcpy {
}
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols,cq));
src_offset+=spitch;
@ -636,22 +636,22 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq);
}
} // namespace ucl_cudart
#endif

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include "nvd_mat.h"
namespace ucl_cudadr {
/// Class storing a texture reference
class UCL_Texture {
public:
@ -38,39 +38,39 @@ class UCL_Texture {
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name)
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Vec<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp>
inline void bind_float(UCL_D_Mat<numtyp> &vec, const unsigned numel)
{ _bind_float(vec,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Vector<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class numtyp, class devtyp>
inline void bind_float(UCL_Matrix<numtyp, devtyp> &vec, const unsigned numel)
{ _bind_float(vec.device,numel); }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private:
CUtexref _tex;
friend class UCL_Kernel;
@ -80,7 +80,7 @@ class UCL_Texture {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
if (vec.element_size()==sizeof(float))
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
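
A short sketch of the texture-binding pattern above, assuming a built UCL_Program prog whose source declares a texture reference; the name "pos_tex", the kernel, and the device vector d_pos are placeholders:

ucl_cudadr::UCL_Texture tex(prog, "pos_tex");
tex.bind_float(d_pos, 4);   // each fetch returns a vector of 4 floats
tex.allow(kernel);          // no-op for CUDA >= 4.0, per the guard above
kernel.run();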

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -41,7 +41,7 @@ class UCL_Timer {
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CU_DESTRUCT_CALL(cuEventDestroy(start_event));
CU_DESTRUCT_CALL(cuEventDestroy(stop_event));
_initialized=false;
@ -63,16 +63,16 @@ class UCL_Timer {
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
@ -80,29 +80,29 @@ class UCL_Timer {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
CU_SAFE_CALL(cuEventSynchronize(stop_event));
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
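
The event-timing pattern above in a hedged sketch; it assumes an init(UCL_Device &) overload (referenced but not shown in this hunk) and an active device dev:

ucl_cudadr::UCL_Timer t;
t.init(dev);                     // create start/stop events on dev's queue
t.start();                       // record start_event
// ... enqueue kernels or copies on the same command queue ...
t.stop();                        // record stop_event
double sec = t.add_to_total();   // syncs on stop_event, returns seconds
std::cout << sec << " s (running total " << t.total_time() << " ms)\n";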

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -40,13 +40,13 @@
#include "ucl_types.h"
namespace ucl_opencl {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cl_command_queue command_queue;
typedef cl_context context_type;
inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
@ -76,19 +76,19 @@ struct OCLProperties {
/// Class for looking at data parallel device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
inline UCL_Device();
inline ~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
/// Return a string with name and info of the current platform
inline std::string platform_name();
@ -104,38 +104,38 @@ class UCL_Device {
* be allocated for use. clear() is called to delete any contexts and
* associated data from previous calls to set(). **/
inline int set(int num);
/// Delete any context and associated data stored from a call to set()
inline void clear();
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the context for the current device
inline cl_context & context() { return _context; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(_default_cq); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Set the default command queue
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the command queue created with device initialization is
used **/
inline void set_command_queue(const int i) { _default_cq=i; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(_default_cq); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a command queue for device computations (with profiling enabled)
inline void push_command_queue() {
cl_int errorv;
@ -143,7 +143,7 @@ class UCL_Device {
_cq.back()=clCreateCommandQueue(_context,_cl_device,
CL_QUEUE_PROFILING_ENABLE,&errorv);
if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create command queue on device: " << name()
<< std::endl;
UCL_GERYON_EXIT;
}
@ -160,76 +160,76 @@ class UCL_Device {
/// Get the current OpenCL device name
inline std::string name() { return name(_device); }
/// Get the OpenCL device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i);
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory() { return shared_memory(_device); }
/// Returns true if host memory is efficiently addressable from device
inline bool shared_memory(const int i)
{ return _shared_mem_device(_properties[i].device_type); }
/// Returns true if double precision is supported for the current device
inline bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
inline bool double_precision(const int i)
{return _properties[i].double_precision;}
/// Get the number of compute units on the current device
inline unsigned cus() { return cus(_device); }
/// Get the number of compute units
inline unsigned cus(const int i)
{ return _properties[i].compute_units; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].global_mem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].global_mem; }
/// Return the GPGPU revision number for current device
//inline double revision() { return revision(_device); }
/// Return the GPGPU revision number
//inline double revision(const int i)
// { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}
/// Return the address alignment in bytes
inline int alignment() { return alignment(_device); }
/// Return the address alignment in bytes
inline int alignment(const int i) { return _properties[i].alignment; }
/// Return the timer resolution
inline size_t timer_resolution() { return timer_resolution(_device); }
/// Return the timer resolution
inline size_t timer_resolution(const int i)
{ return _properties[i].timer_resolution; }
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].work_group_size; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
@ -254,7 +254,7 @@ class UCL_Device {
{ return fission_by_counts(_device); }
/// True if splitting device into subdevices by specified counts supported
inline bool fission_by_counts(const int i)
{ return _properties[i].partition_counts; }
/// True if splitting device into subdevices by affinity domains supported
inline bool fission_by_affinity()
{ return fission_by_affinity(_device); }
@ -271,10 +271,10 @@ class UCL_Device {
/// List all devices along with all properties
inline void print_all(std::ostream &out);
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
private:
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
@ -287,7 +287,7 @@ class UCL_Device {
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device
inline void add_properties(cl_device_id);
inline int create_context();
int _default_cq;
@ -300,7 +300,7 @@ UCL_Device::UCL_Device() {
// --- Get Number of Platforms
cl_uint nplatforms;
cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms);
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
return;
@ -328,18 +328,18 @@ void UCL_Device::clear() {
int UCL_Device::set_platform(int pid) {
clear();
cl_int errorv;
_cl_device=0;
_device=-1;
_num_devices=0;
_default_cq=0;
#ifdef UCL_DEBUG
assert(pid<num_platforms());
#endif
_platform=pid;
_cl_platform=_cl_platforms[_platform];
// --- Get Number of Devices
cl_uint n;
errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
@ -351,7 +351,7 @@ int UCL_Device::set_platform(int pid) {
cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
&n));
// --- Store properties for each device
for (int i=0; i<_num_devices; i++) {
_cl_devices.push_back(device_list[i]);
@ -385,7 +385,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op;
char buffer[1024];
cl_bool ans_bool;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
@ -409,8 +409,8 @@ void UCL_Device::add_properties(cl_device_id device_list) {
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(cl_uint),&op.alignment,NULL));
op.alignment/=8;
// Determine if double precision is supported
cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
@ -420,11 +420,11 @@ void UCL_Device::add_properties(cl_device_id device_list) {
op.double_precision=false;
else
op.double_precision=true;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL));
op.ecc_support=false;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
@ -432,7 +432,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
sizeof(ans_bool),&ans_bool,NULL));
if (ans_bool==CL_TRUE)
op.ecc_support=true;
op.c_version="";
op.partition_equal=false;
op.partition_counts=false;
@ -458,30 +458,30 @@ void UCL_Device::add_properties(cl_device_id device_list) {
else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
op.partition_affinity=true;
}
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
sizeof(cl_uint),&op.max_sub_devices,NULL));
#endif
_properties.push_back(op);
}
std::string UCL_Device::platform_name() {
char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
NULL));
std::string ans=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
NULL));
ans+=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
NULL));
ans+=std::string(info);
return ans;
}
@ -512,7 +512,7 @@ int UCL_Device::device_type(const int i) {
// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
clear();
cl_device_id device_list[_num_devices];
cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -557,7 +557,7 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].work_item_size[1] << " x "
<< _properties[i].work_item_size[2] << std::endl;
//out << " Maximum sizes of each dimension of a grid: "
// << _properties[i].maxGridSize[0] << " x "
// << _properties[i].maxGridSize[1] << " x "
// << _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include <fstream>
namespace ucl_opencl {
class UCL_Texture;
template <class numtyp> class UCL_D_Vec;
template <class numtyp> class UCL_D_Mat;
@ -41,10 +41,10 @@ class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
inline UCL_Program(UCL_Device &device, const void *program,
const char *flags="", std::string *log=NULL) :
_init_done(false) {
init(device);
load_string(program,flags,log);
}
@ -56,7 +56,7 @@ class UCL_Program {
_device=device.cl_device();
_context=device.context();
_cq=device.cq();
CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_init_done=true;
}
@ -65,7 +65,7 @@ class UCL_Program {
/** \note Must call init() after each clear **/
inline void clear() {
if (_init_done) {
CL_DESTRUCT_CALL(clReleaseProgram(_program));
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
_init_done=false;
@ -77,20 +77,20 @@ class UCL_Program {
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
UCL_GERYON_EXIT;
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="",
std::string *log=NULL) {
@ -103,23 +103,23 @@ class UCL_Program {
CL_CHECK_ERR(error_flag);
cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
NULL, &ms));
char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
if (log!=NULL)
*log=std::string(build_log);
if (build_status != CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program ("
@ -130,10 +130,10 @@ class UCL_Program {
return UCL_COMPILE_ERROR;
}
}
return UCL_SUCCESS;
}
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Change the default command queue associated with matrix
@ -143,7 +143,7 @@ class UCL_Program {
private:
bool _init_done;
cl_program _program;
cl_device_id _device;
cl_context _context;
cl_command_queue _cq;
};
@ -153,7 +153,7 @@ class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; }
inline UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }
@ -178,48 +178,48 @@ class UCL_Kernel {
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template <class dtype>
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
inline void set_arg(const cl_uint index, const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) {
_num_args=index;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
#endif
}
}
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
inline void set_arg(const UCL_D_Vec<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class numtyp>
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
inline void set_arg(const UCL_D_Mat<numtyp> * const arg)
{ set_arg(&arg->begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
inline void set_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Set a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
inline void set_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ set_arg(&arg->device.begin()); }
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype * const arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
#ifdef UCL_DEBUG
if (_num_args>_kernel_info_nargs) {
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
std::cerr << "TOO MANY ARGUMENTS TO OPENCL FUNCTION: "
<< _kernel_info_name << std::endl;
assert(0==1);
}
@ -228,31 +228,31 @@ class UCL_Kernel {
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
inline void add_arg(const UCL_D_Vec<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class numtyp>
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
inline void add_arg(const UCL_D_Mat<numtyp> * const arg)
{ add_arg(&arg->begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
inline void add_arg(const UCL_Vector<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Add a geryon container as a kernel argument.
template <class hosttype, class devtype>
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
inline void add_arg(const UCL_Matrix<hosttype, devtype> * const arg)
{ add_arg(&arg->device.begin()); }
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
_block_size[0]=block_size;
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
_block_size[0]=block_size;
}
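
Note that, unlike the CUDA grid convention, _num_blocks here stores the OpenCL global work size (blocks times threads), so callers typically compute the block count with a ceiling divide. A hedged sketch, where k is a UCL_Kernel and n a size_t item count, both illustrative:

  const size_t block=128;                  // threads per work-group
  const size_t blocks=(n+block-1)/block;   // ceiling divide over n items
  k.set_size(blocks,block);                // stored as global size blocks*block
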
/// Set the number of thread blocks and the number of threads in each block
@ -266,36 +266,36 @@ class UCL_Kernel {
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue for the kernel is changed to cq **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y,
command_queue &cq)
command_queue &cq)
{_cq=cq; set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y);}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called before any arguments have been added
\note The default command queue is used for the kernel execution **/
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=3;
_dimensions=3;
const size_t num_blocks_z=1;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
_num_blocks[2]=num_blocks_z*block_size_z;
_block_size[2]=block_size_z;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
_num_blocks[2]=num_blocks_z*block_size_z;
_block_size[2]=block_size_z;
}
/// Set the number of thread blocks and the number of threads in each block
@ -305,13 +305,13 @@ class UCL_Kernel {
const size_t block_size_x, const size_t block_size_y,
const size_t block_size_z, command_queue &cq) {
_cq=cq;
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
set_size(num_blocks_x, num_blocks_y, block_size_x, block_size_y,
block_size_z);
}
/// Run the kernel in the default command queue
inline void run();
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
@ -320,7 +320,7 @@ class UCL_Kernel {
  /// Change the default command queue associated with the kernel
inline void cq(command_queue &cq_in) { _cq=cq_in; }
#include "ucl_arg_kludge.h"
private:
cl_kernel _kernel;
cl_program _program;
@ -328,7 +328,7 @@ class UCL_Kernel {
size_t _block_size[3];
size_t _num_blocks[3];
bool _function_set;
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
@ -348,7 +348,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
CL_SAFE_CALL(clRetainProgram(_program));
cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag);
if (error_flag!=CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
@ -357,7 +357,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
return UCL_FUNCTION_NOT_FOUND;
}
#ifdef UCL_DEBUG
_kernel_info_name=function;
cl_uint nargs;
@ -375,7 +375,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
#endif
#endif
return UCL_SUCCESS;
return UCL_SUCCESS;
}
void UCL_Kernel::run() {
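
Putting this file's pieces together, a hedged end-to-end sketch (prog and the "scale" kernel come from the compile sketch earlier; buf is a filled UCL_D_Vec<float> of length n, both illustrative):

  ucl_opencl::UCL_Kernel k(prog,"scale");  // set_function() looks up the kernel
  float f=2.0f;
  k.set_size((n+127)/128,128);             // geometry first, then arguments
  k.add_arg(&buf);                         // container overload shown above
  k.add_arg(&f);                           // plain POD argument
  k.run();                                 // enqueues on the default queue
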

View File

@ -17,12 +17,12 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef OCL_MAT_H
#define OCL_MAT_H
@ -54,6 +54,6 @@ namespace ucl_opencl {
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_opencl
} // namespace ucl_opencl
#endif

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -36,10 +36,10 @@ namespace ucl_opencl {
// --------------------------------------------------------------------------
struct ocl_kernel_dim {
size_t x,y,z;
ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
ocl_kernel_dim(size_t _x = 1, size_t _y = 1, size_t _z = 1) :
x(_x), y(_y), z(_z) {}
operator size_t * () { return (size_t *)this; }
operator const size_t * () const { return (const size_t *)this; }
operator const size_t * () const { return (const size_t *)this; }
};
typedef ocl_kernel_dim ucl_kernel_dim;
@ -53,13 +53,13 @@ typedef cl_mem device_ptr;
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_int error_flag;
cl_context context;
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
&context,NULL));
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind2==UCL_NOT_SPECIFIED) {
@ -88,7 +88,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
buffer_perm=CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR;
else
buffer_perm=CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
if (kind==UCL_READ_ONLY) {
#ifdef CL_VERSION_1_2
buffer_perm=buffer_perm | CL_MEM_HOST_READ_ONLY;
@ -102,9 +102,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
} else
map_perm=CL_MAP_READ | CL_MAP_WRITE;
}
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
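
The branch-heavy setup above ends in the standard OpenCL pinned-host idiom: allocate with CL_MEM_ALLOC_HOST_PTR, then map the buffer to obtain a host pointer. For reference, a raw sketch of that idiom, with ctx, q, and nbytes assumed and error handling trimmed:

  cl_int err;
  cl_mem buf=clCreateBuffer(ctx,CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                            nbytes,NULL,&err);
  void *host=clEnqueueMapBuffer(q,buf,CL_TRUE,CL_MAP_READ | CL_MAP_WRITE,
                                0,nbytes,0,NULL,NULL,&err);
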
@ -125,7 +125,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
&orig_flags,NULL));
orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;
mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
*mat.host_ptr(), &error_flag);
@ -135,7 +135,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
@ -160,7 +160,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
cl_int error_flag;
mat.cbegin()=clCreateBuffer(dev.context(),buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
@ -210,7 +210,7 @@ inline int _host_resize(mat_type &mat, const size_t n) {
map_perm=CL_MAP_READ | CL_MAP_WRITE;
mat.cbegin()=clCreateBuffer(context,buffer_perm,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
*mat.host_ptr() = (typename mat_type::data_type*)
clEnqueueMapBuffer(mat.cq(),mat.cbegin(),CL_TRUE,
@ -248,7 +248,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -278,7 +278,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
assert(0==1);
mat.cbegin()=clCreateBuffer(dev.context(),flag,n,NULL,
&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@ -304,7 +304,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t rows,
if (dev.device_type()!=UCL_CPU && cols%256!=0)
padded_cols+=256-cols%256;
pitch=padded_cols*sizeof(typename mat_type::data_type);
return _device_alloc(mat,dev,pitch*rows,kind);
return _device_alloc(mat,dev,pitch*rows,kind);
}
template <class mat_type>
@ -342,7 +342,7 @@ inline int _device_resize(mat_type &mat, const size_t n) {
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,n,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
@ -380,7 +380,7 @@ inline int _device_resize(mat_type &mat, const size_t rows,
else
assert(0==1);
mat.cbegin()=clCreateBuffer(context,flag,pitch*rows,NULL,&error_flag);
if (error_flag != CL_SUCCESS)
if (error_flag != CL_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
@ -396,21 +396,21 @@ inline void _host_zero(void *ptr, const size_t n) {
inline void _ocl_build(cl_program &program, cl_device_id &device,
const char* options = "") {
clBuildProgram(program,1,&device,options,NULL,NULL);
cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
CL_SAFE_CALL(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
if (build_status == CL_SUCCESS)
return;
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
CL_SAFE_CALL(clGetProgramBuildInfo(program, device,CL_PROGRAM_BUILD_LOG, 0,
NULL, &ms));
char build_log[ms];
char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " Error compiling OpenCL Program...\n"
@ -423,13 +423,13 @@ inline void _ocl_kernel_from_source(cl_context &context, cl_device_id &device,
cl_kernel &kernel, const char *function,
const char *options="") {
cl_int error_flag;
cl_program program=clCreateProgramWithSource(context,lines,source,
NULL,&error_flag);
CL_CHECK_ERR(error_flag);
CL_CHECK_ERR(error_flag);
_ocl_build(program,device,options);
kernel=clCreateKernel(program,function,&error_flag);
CL_CHECK_ERR(error_flag);
CL_CHECK_ERR(error_flag);
}
template <class mat_type>
@ -452,17 +452,17 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
cl_device_id device;
CL_SAFE_CALL(clGetContextInfo(context,CL_CONTEXT_DEVICES,
sizeof(cl_device_id),&device,NULL));
const char * szero[3]={
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
"__kernel void _device_zero(__global NUMTYP *a, const int offset)",
" { int gid=get_global_id(0)+offset; a[gid]=(NUMTYP)0; }"
};
cl_kernel kzero;
_ocl_kernel_from_source(context,device,szero,3,kzero,"_device_zero",
_UCL_DATA_ID<typename mat_type::data_type>::numtyp_flag());
cl_int offset=mat.offset();
CL_SAFE_CALL(clSetKernelArg(kzero,0,sizeof(cl_mem),(void *)&mat.begin()));
CL_SAFE_CALL(clSetKernelArg(kzero,1,sizeof(cl_int),(void *)&offset));
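
For OpenCL 1.2 and later, clEnqueueFillBuffer gives the same effect without compiling a kernel at all; a hedged sketch of that alternative (q, buf, and nbytes assumed), while the kernel route above keeps OpenCL 1.0/1.1 working:

  const float zero=0.0f;
  clEnqueueFillBuffer(q,buf,&zero,sizeof(zero),0,nbytes,0,NULL,NULL);
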
@ -486,7 +486,7 @@ template<> struct _ucl_memcpy<2,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@ -504,7 +504,7 @@ template<> struct _ucl_memcpy<2,0> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@ -522,7 +522,7 @@ template<> struct _ucl_memcpy<2,1> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@ -540,7 +540,7 @@ template<> struct _ucl_memcpy<0,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@ -558,7 +558,7 @@ template<> struct _ucl_memcpy<1,2> {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
@ -587,9 +587,9 @@ template <> struct _ucl_memcpy<1,0> {
dst.begin(),0,NULL,NULL));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
@ -602,20 +602,20 @@ template <> struct _ucl_memcpy<1,0> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 2NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,
spitch*rows,
(char *)dst.begin()+dst_offset,0,NULL,
NULL));
else
for (size_t i=0; i<rows; i++) {
for (size_t i=0; i<rows; i++) {
CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,cols,
(char *)dst.begin()+dst_offset,0,NULL,
NULL));
src_offset+=spitch;
dst_offset+=dpitch;
}
}
}
};
@ -630,7 +630,7 @@ template <> struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3S\n";
#endif
return;
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 3NS\n";
@ -639,9 +639,9 @@ template <> struct _ucl_memcpy<0,1> {
src.begin(),0,NULL,NULL));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
@ -649,12 +649,12 @@ template <> struct _ucl_memcpy<0,1> {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4S\n";
#endif
return;
return;
}
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 4NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,
spitch*rows,
@ -667,7 +667,7 @@ template <> struct _ucl_memcpy<0,1> {
NULL));
src_offset+=spitch;
dst_offset+=dpitch;
}
}
}
};
@ -687,33 +687,33 @@ template <int mem1, int mem2> struct _ucl_memcpy {
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 6S\n";
#endif
if (block==CL_TRUE) ucl_sync(cq);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, cl_command_queue &cq,
const cl_bool block,
size_t dst_offset, size_t src_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
if (src.cbegin()!=dst.cbegin() || src_offset!=dst_offset) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 7NS\n";
#endif
if (spitch==dpitch && dst.cols()==src.cols() &&
if (spitch==dpitch && dst.cols()==src.cols() &&
src.cols()==cols/src.element_size())
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),src_offset,
dst_offset,spitch*rows,0,NULL,NULL));
else
for (size_t i=0; i<rows; i++) {
for (size_t i=0; i<rows; i++) {
CL_SAFE_CALL(clEnqueueCopyBuffer(cq,src.cbegin(),dst.cbegin(),
src_offset,dst_offset,cols,0,
NULL,NULL));
src_offset+=spitch;
dst_offset+=dpitch;
}
}
}
}
#ifdef UCL_DBG_MEM_TRACE
else std::cerr << "UCL_COPY 7S\n";
#endif
@ -736,8 +736,8 @@ inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,dst.cq(),CL_TRUE,
@ -745,15 +745,15 @@ inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,cl_command_queue &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq,CL_FALSE,
dst.byteoff(),src.byteoff());
}
} // namespace ucl_opencl
} // namespace ucl_opencl
#endif
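
A hedged sketch of the pitched copy forms above, with a host-side dst, a device-side src, and illustrative pitches, sizes, and queue:

  // Blocking form: runs on dst's default queue (CL_TRUE above)
  ucl_mv_cpy(dst,dpitch,src,spitch,cols*sizeof(float),rows);
  // Asynchronous form: enqueues on cq and returns immediately (CL_FALSE above)
  ucl_mv_cpy(dst,dpitch,src,spitch,cols*sizeof(float),rows,cq);
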

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -28,7 +28,7 @@
#include "ocl_mat.h"
namespace ucl_opencl {
/// Class storing a texture reference
class UCL_Texture {
public:
@ -46,9 +46,9 @@ class UCL_Texture {
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
private:
friend class UCL_Kernel;
};

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -67,33 +67,33 @@ class UCL_Timer {
clRetainCommandQueue(_cq);
_initialized=true;
}
/// Start timing on default command queue
inline void start() { UCL_OCL_MARKER(_cq,&start_event); }
/// Stop timing on default command queue
inline void stop() { UCL_OCL_MARKER(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
inline void zero()
{ UCL_OCL_MARKER(_cq,&start_event); UCL_OCL_MARKER(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Add a user specified time to the total (ms)
inline void add_time_to_total(const double t) { _total_time+=t; }
@ -107,12 +107,12 @@ class UCL_Timer {
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL));
return (tend-tstart)*t_factor;
return (tend-tstart)*t_factor;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
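
A hedged timing sketch for this class (dev and k are illustrative; it assumes the usual UCL_Timer(UCL_Device &) constructor and a command queue created with CL_QUEUE_PROFILING_ENABLE, which clGetEventProfilingInfo requires):

  ucl_opencl::UCL_Timer t(dev);  // retains the device's command queue
  t.start();                     // marker enqueued before the work
  k.run();
  t.stop();                      // marker enqueued after the work
  double s=t.add_to_total();     // synchronizes; returns seconds, accumulates ms
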

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -38,47 +38,47 @@
template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -87,9 +87,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -98,8 +98,8 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
}
@ -109,9 +109,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -120,9 +120,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -131,9 +131,9 @@
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -144,10 +144,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -158,10 +158,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -172,10 +172,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -186,10 +186,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -200,10 +200,10 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -216,10 +216,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
}
@ -233,10 +233,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
}
@ -250,10 +250,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
}
@ -267,10 +267,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
}
@ -284,11 +284,11 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
}
template <class t1, class t2, class t3, class t4, class t5,
@ -303,11 +303,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
}
@ -323,11 +323,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
}
@ -343,11 +343,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
}
@ -363,11 +363,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
}
@ -383,12 +383,12 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
}
@ -425,7 +425,7 @@
template <class t1, class t2, class t3, class t4, class t5>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run();
}
@ -434,8 +434,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run();
}
@ -444,8 +444,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run();
}
@ -454,8 +454,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run();
}
@ -464,8 +464,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run();
}
@ -474,8 +474,8 @@
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run();
}
@ -486,9 +486,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run();
}
@ -499,8 +499,8 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run();
}
@ -512,9 +512,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run();
}
@ -525,9 +525,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run();
}
@ -538,9 +538,9 @@
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run();
}
@ -553,10 +553,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run();
}
@ -569,10 +569,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run();
}
@ -585,10 +585,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run();
}
@ -601,10 +601,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run();
}
@ -617,10 +617,10 @@
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run();
}
@ -635,10 +635,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run();
}
@ -654,10 +654,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run();
}
@ -673,10 +673,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run();
}
@ -692,10 +692,10 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run();
}
@ -711,11 +711,11 @@
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run();
}
@ -732,11 +732,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run();
}
@ -754,11 +754,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run();
}
@ -776,12 +776,12 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run();
}
@ -798,11 +798,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run();
}
@ -820,11 +820,11 @@
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
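A minimal caller for the overloads above, as a sketch only: every run() overload reduces to clear_args(), one add_arg() per argument, then a launch on the kernel's command queue. It assumes the usual Geryon entry points (UCL_Program, UCL_Kernel, set_size) behave as elsewhere in this library; the kernel name "k_axpy", its source string, and the launch geometry are illustrative, not defined in this diff.

// Hypothetical usage sketch, not part of this header.
static void example_axpy(UCL_Device &dev, const char *kernel_src,
                         UCL_D_Vec<float> &x, UCL_D_Vec<float> &y) {
  UCL_Program prog(dev);
  prog.load_string(kernel_src,"");     // build program from a source string
  UCL_Kernel k_axpy(prog,"k_axpy");    // look up the kernel by name
  int n=x.cols(); float alpha=2.0f;
  k_axpy.set_size(n/64+1,64);          // number of blocks, block size
  k_axpy.run(&alpha,&x,&y,&n);         // forwards to the add_arg() chain above
}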

View File

@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -52,10 +52,10 @@
/// Base class for vector/matrix containers
/** All containers are associated with a default command queue.
* For CUDA, this is the default stream.
*
* The default queue is used for asynchronous operations on the container
* that do not specify a queue. For OpenCL, this queue is also used in
* calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0), _kind(UCL_VIEW) { }
@@ -68,8 +68,8 @@ class UCL_BaseMat {
inline void sync() { ucl_sync(_cq); }
/// Return the type/permissions of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
* or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
inline bool shared_mem_device() {
#ifdef _OCL_MAT
@@ -79,12 +79,12 @@
cl_device_type device_type;
CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
sizeof(device_type),&device_type,NULL));
return _shared_mem_device(device_type);
#else
return false;
#endif
}
protected:
command_queue _cq;
enum UCL_MEMOPT _kind;
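A short usage sketch for the base-class facilities above: cq() names the container's default queue, sync() blocks on it, and kind() reports how the memory was obtained. The helper below is illustrative and assumes a device vector that has already been filled.

// Hypothetical usage sketch, not part of this header.
static void example_readback(UCL_Device &dev, UCL_D_Vec<float> &v) {
  UCL_H_Vec<float> host_buf(v.cols(),dev,UCL_READ_WRITE);
  ucl_copy(host_buf,v,true);       // non-blocking copy on the default queue
  host_buf.sync();                 // UCL_BaseMat::sync() blocks until done
  if (host_buf.kind()!=UCL_VIEW)   // views never own their allocation
    host_buf.clear();
}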

View File

@@ -17,33 +17,33 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/***************************************************************************
The ucl_copy and ucl_cast_copy routines provide a general prototype for
copying data between host and device memory (including texture memory)
for the matrix and vector types in nvc_memory.
For host/host and host/device transfers, typecasting is performed
automatically as necessary.
The routines are written so that all branches can be removed by the
compiler during template instantiation.
The routines currently assume row-major ordering for all types.
For asynchronous copy in the default command queue, async is boolean true;
For asynchronous copy in a specified command queue, async is command queue
Otherwise, set async to boolean false;
When performing frequent data copies that require casting, it is more
efficient to allocate a casting buffer once and then pass that buffer
to the copy routine. This can be accomplished with the ucl_cast_copy
routines.
Examples
(x's represent alignment padding - to maintain alignment)
(o's represent a larger matrix in memory)
(vectors represented as single row)
@@ -51,18 +51,18 @@
dst             src           command
----------------------------------------------------------------
0 1 2 3 4   <-- 0 1 2 3 4     ucl_copy(dst,src,async)
0 1 2 3     <-- 0 1 2 3 4     ucl_copy(dst,src,4,async)
0 1 2       <-- 0 1 2 3 4 5   ucl_copy(dst,src,async)
3 4 5
0 1 2 3 4 5 <-- 0 1 2         ucl_copy(dst,src,async)
                3 4 5
0 1 2       <-- 0 1 2         ucl_copy(dst,src,async)
3 4 5           3 4 5
0 1 2       <-- 0 1 2         ucl_copy(dst,src,6,async)
3 4 5           3 4 5
                5 6 7
@@ -70,33 +70,33 @@
0 1 2       <-- 0 1 2 3       ucl_copy(dst,src,2,3,async)
4 5 6           4 5 6 7
                8 9 10 11
0 1 2 x x   <-- 0 1 2         ucl_copy(dst,src,async)
3 4 5 x x       3 4 5
0 1 2       <-- 0 1 2 x x     ucl_copy(dst,src,async)
3 4 5           3 4 5 x x
0 1 2 o o   <-- 0 1 2         ucl_copy(dst,src,2,3,async)
3 4 5 o o       3 4 5
o o o o o
0 1 2 o o   <-- 0 1 2 3 4 5   ucl_copy(dst,src,2,3,async)
3 4 5 o o
o o o o o
0 1 o o o   <-- 0 1 2 3 4 5   ucl_copy(dst,src,2,2,async)
2 3 o o o
o o o o o
0 1 2 o o   <-- 0 1 2 3 4     ucl_copy(dst,src,2,3,async)
5 6 7 o o       5 6 7 8 9
o o o o o       10 11 12 13 14
0 1 2 5 6 7 <-- 0 1 2 3 4     ucl_copy(dst,src,2,3,async)
                5 6 7 8 9
                10 11 12 13 14
***************************************************************************/
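Putting the rules above together, a sketch of a mixed-precision transfer: a double-precision host vector can feed a single-precision device vector, and a persistent cast buffer avoids reallocating the staging space on every transfer. Names and sizes are illustrative assumptions.

// Hypothetical usage sketch, not part of this header.
static void example_cast_copy(UCL_Device &dev, const size_t n) {
  UCL_H_Vec<double> h_src(n,dev,UCL_WRITE_ONLY);    // host data in double
  UCL_D_Vec<float> d_dst(n,dev,UCL_READ_ONLY);      // device data in float
  UCL_H_Vec<float> cast_buf(n,dev,UCL_NOT_PINNED);  // reusable cast buffer
  for (size_t i=0; i<n; i++) h_src[i]=static_cast<double>(i);
  ucl_cast_copy(d_dst,h_src,n,cast_buf,true);       // asynchronous cast+copy
  d_dst.sync();                                     // wait before kernel use
}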
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
@@ -124,7 +124,7 @@ inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
assert(0==1);
}
}
}
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
@@ -182,7 +182,7 @@ template <> struct _host_host_copy<1,1> {
return;
}
#endif
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 8NS\n";
#endif
@@ -212,7 +212,7 @@ template <int host_t1, int host_t2> struct _host_host_copy {
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
assert(0==1);
}
};
// --------------------------------------------------------------------------
@@ -242,20 +242,20 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows);
@@ -276,23 +276,23 @@ template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
cast_buffer.sync();
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows,cq);
@@ -338,7 +338,7 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
@@ -404,9 +404,9 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
if (mat3::VECTOR==0) {
assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
}
@@ -472,23 +472,23 @@ template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
@@ -497,23 +497,23 @@ template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
@@ -525,7 +525,7 @@ template <> struct _ucl_cast_copy<0,0> {
/** \param numel Number of elements (not bytes) to copy
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -551,7 +551,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -580,7 +580,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
@@ -593,7 +593,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
@@ -606,8 +606,8 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}
/// Copy matrix/vector (memory already allocated)
@@ -619,7 +619,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
@@ -648,7 +648,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
cast_buffer);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
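The dispatch above lets one call site cover all three paths: host-to-host falls through to the memcpy-style routine, mixed-precision host/device transfers stage through a temporary cast buffer, and same-type transfers go straight to ucl_mv_cpy. A sketch with illustrative names:

// Hypothetical usage sketch, not part of this header.
static void example_dispatch(UCL_Device &dev, const size_t n) {
  UCL_H_Vec<double> h(n,dev);   // host vector, double precision
  UCL_D_Vec<float> d(n,dev);    // device vector, single precision
  UCL_D_Vec<float> d2(n,dev);   // device vector, single precision
  ucl_copy(d,h,n,false);        // mixed types: stages through a temporary
  ucl_copy(d2,d,n,false);       // same type: direct ucl_mv_cpy path
}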
// --------------------------------------------------------------------------
@@ -659,11 +659,11 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
@@ -686,16 +686,16 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
@@ -710,11 +710,11 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
@@ -730,7 +730,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
@@ -773,9 +773,9 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
@@ -796,7 +796,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
@@ -846,7 +846,7 @@ inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
@@ -866,7 +866,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/** \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
@@ -885,7 +885,7 @@ inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
@@ -924,7 +924,7 @@
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
if (async)
ucl_copy(dst,src,dst.cq());
else if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),async);
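The rows,cols overloads documented above copy the upper-left tile row by row, so a pitched device matrix can be mirrored into a tightly packed host matrix in one call. A sketch with illustrative names and sizes:

// Hypothetical usage sketch, not part of this header.
static void example_tile(UCL_Device &dev, const size_t rows,
                         const size_t cols) {
  UCL_D_Mat<float> d_m(rows,cols,dev);  // device matrix, possibly padded
  UCL_H_Mat<float> h_m(rows,cols,dev);  // tightly packed host matrix
  ucl_copy(h_m,d_m,rows,cols,false);    // blocking copy of the full tile
}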

View File

@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -37,23 +37,23 @@ class UCL_D_Mat : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_D_Mat() : _cols(0) {}
~UCL_D_Mat() { _device_free(*this); }
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0) { alloc(rows,cols,device,kind); }
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
@@ -65,7 +65,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
@@ -82,9 +82,9 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
@@ -118,15 +118,15 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
@@ -145,7 +145,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -157,39 +157,39 @@ class UCL_D_Mat : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
@@ -215,7 +215,7 @@ class UCL_D_Mat : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -223,13 +223,13 @@ class UCL_D_Mat : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
@@ -248,7 +248,7 @@ class UCL_D_Mat : public UCL_BaseMat {
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -261,45 +261,45 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,
UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
@@ -307,7 +307,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
@@ -320,7 +320,7 @@ class UCL_D_Mat : public UCL_BaseMat {
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
@@ -332,20 +332,20 @@ class UCL_D_Mat : public UCL_BaseMat {
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
@@ -356,7 +356,7 @@ class UCL_D_Mat : public UCL_BaseMat {
int err=_device_resize(*this,rows,cols,_pitch);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
UCL_GERYON_EXIT;
#endif
@@ -372,13 +372,13 @@ class UCL_D_Mat : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero asynchronously in the default command_queue
@@ -386,10 +386,10 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Set first n elements to zero asynchronously in the default command_queue
inline void zero(const int n) { zero(n,_cq); }
/// Set each element to zero asynchronously
inline void zero(command_queue &cq)
{ _device_zero(*this,row_bytes()*_rows,cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
@@ -445,7 +445,7 @@ class UCL_D_Mat : public UCL_BaseMat {
inline size_t row_bytes() const { return _pitch; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
@@ -459,7 +459,7 @@ class UCL_D_Mat : public UCL_BaseMat {
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
private:
size_t _pitch, _row_size, _rows, _cols;
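A lifecycle sketch for the matrix container above: alloc() and resize() report UCL_SUCCESS on success, zero() is asynchronous on the default queue, and view() aliases existing storage without copying. Names and sizes are illustrative assumptions.

// Hypothetical usage sketch, not part of this header.
static void example_d_mat(UCL_Device &dev, const size_t rows,
                          const size_t cols) {
  UCL_D_Mat<float> m;
  if (m.alloc(rows,cols,dev,UCL_READ_WRITE)!=UCL_SUCCESS)
    return;                                 // honor the UCL_SUCCESS contract
  m.zero();                                 // asynchronous clear on m's queue
  m.sync();                                 // wait for the zero to finish
  UCL_D_Mat<float> tile;
  tile.view(m,rows/2,cols/2,m.row_size());  // upper-left sub-block, no copy
  m.clear();                                // frees m; the view never does
}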

View File

@@ -17,14 +17,14 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row vector on device
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
public:
@@ -37,7 +37,7 @@ class UCL_D_Vec : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_D_Vec() : _cols(0) {}
~UCL_D_Vec() { _device_free(*this); }
@@ -45,7 +45,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0) { alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
@@ -58,7 +58,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_row_bytes=cols*sizeof(numtyp);
@@ -82,8 +82,8 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
@@ -116,7 +116,7 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
@@ -142,18 +142,18 @@ class UCL_D_Vec : public UCL_BaseMat {
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
@@ -162,24 +162,24 @@ class UCL_D_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -205,15 +205,15 @@ class UCL_D_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
@@ -223,7 +223,7 @@ class UCL_D_Vec : public UCL_BaseMat {
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
@@ -248,45 +248,45 @@ class UCL_D_Vec : public UCL_BaseMat {
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
@@ -302,7 +302,7 @@ class UCL_D_Vec : public UCL_BaseMat {
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
@@ -315,20 +315,20 @@ class UCL_D_Vec : public UCL_BaseMat {
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view_offset(offset,input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
@@ -336,12 +336,12 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _device_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to contain cols elements
@@ -369,9 +369,9 @@ class UCL_D_Vec : public UCL_BaseMat {
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
@@ -384,7 +384,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// Set each element to zero asynchronously
inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
/// Set first n elements to zero asynchronously
inline void zero(const int n, command_queue &cq)
{ _device_zero(*this,n*sizeof(numtyp),cq); }
#ifdef _UCL_DEVICE_PTR_MAT
@@ -402,7 +402,7 @@ class UCL_D_Vec : public UCL_BaseMat {
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
@@ -427,10 +427,10 @@ class UCL_D_Vec : public UCL_BaseMat {
inline const numtyp ** cbegin() const { return &_array; }
/// For CUDA-RT, allocate row vector and bind texture
inline void safe_alloc(const size_t cols, UCL_Device &dev,
textureReference *t)
{ alloc(cols,dev); assign_texture(t); bind(); }
/// For CUDA-RT, assign a texture to matrix
inline void assign_texture(textureReference *t) { _tex_ptr=t; }
/// For CUDA-RT, bind to texture
inline void bind() {
cuda_gb_get_channel<numtyp>(_channel);
@@ -456,7 +456,7 @@ class UCL_D_Vec : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
@@ -473,7 +473,7 @@ class UCL_D_Vec : public UCL_BaseMat {
private:
size_t _row_bytes, _row_size, _rows, _cols;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
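A sketch for the vector container above: alloc(), an asynchronous zero() on an explicit queue, and a no-copy window onto the second half of the data through view_offset(). Names and sizes are illustrative assumptions.

// Hypothetical usage sketch, not part of this header.
static void example_d_vec(UCL_Device &dev, const size_t n) {
  UCL_D_Vec<float> v;
  if (v.alloc(2*n,dev,UCL_READ_WRITE)!=UCL_SUCCESS)
    return;
  v.zero();                        // clear all 2*n elements asynchronously
  UCL_D_Vec<float> back_half;
  back_half.view_offset(n,v,n);    // elements [n,2n) of v, no copy made
  back_half.zero((int)n,v.cq());   // same queue keeps operations ordered
  v.sync();                        // block until both clears complete
}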

View File

@@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@@ -37,21 +37,21 @@ class UCL_H_Mat : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_H_Mat() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { _host_free(*this); }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host
@@ -74,7 +74,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
@@ -84,7 +84,7 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind;
_end=_array+rows*cols;
return err;
}
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
@@ -117,15 +117,15 @@ class UCL_H_Mat : public UCL_BaseMat {
_kind=kind;
_end=_array+rows*cols;
return err;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
@@ -149,45 +149,45 @@ class UCL_H_Mat : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
@@ -197,40 +197,40 @@ class UCL_H_Mat : public UCL_BaseMat {
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes*rows);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
@@ -244,81 +244,81 @@ class UCL_H_Mat : public UCL_BaseMat {
_host_view(*this,input,_row_bytes*_rows);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _host_free(*this); _cols=0; _kind=UCL_VIEW; }
/// Resize the allocation to rows x cols elements
/** \note Cannot be used on views **/
@@ -333,7 +333,7 @@ class UCL_H_Mat : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
@@ -347,7 +347,7 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Resize (only if bigger) the allocation to contain rows x cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int rows, const int cols)
{ if (cols>_cols || rows>_rows) return resize(rows,cols);
else return UCL_SUCCESS; }
/// Set each element to zero
@ -376,21 +376,21 @@ class UCL_H_Mat : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access to the element at (row, col)
  inline numtyp & operator()(const int row, const int col)
    { return _array[row*_cols+col]; }
/// 2D access to the element at (row, col)
  inline const numtyp & operator()(const int row, const int col) const
    { return _array[row*_cols+col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
@ -409,14 +409,14 @@ class UCL_H_Mat : public UCL_BaseMat {
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const void ** cbegin() const { return (const void **)&_array; }
#endif
private:
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
#endif
};
#endif
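Taken together, the view_offset overloads above let a UCL_H_Mat alias a slice of an existing allocation without owning it. A minimal sketch (hypothetical names and sizes; assumes an initialized UCL_Device dev and a rows/cols alloc overload symmetric to the vector's):

// Sketch only: alias one row of an owning host matrix via view_offset.
UCL_H_Mat<double> big;
big.alloc(4,8,dev,UCL_READ_WRITE);  // owning 4x8 host allocation
big.zero();
UCL_H_Mat<double> row;
row.view_offset(2*8,big,1,8);       // non-owning view of row 2 (offset in elements)
row(0,5)=1.0;                       // writes through to big(2,5)
row.clear();                        // resets the view; big still owns its memory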

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -37,21 +37,21 @@ class UCL_H_Vec : public UCL_BaseMat {
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_H_Vec() : _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { _host_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_READ_WRITE - Specify that you will read and write from host
@ -84,7 +84,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_kind=kind;
_end=_array+cols;
return err;
}
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
@ -108,7 +108,7 @@ class UCL_H_Vec : public UCL_BaseMat {
<< " bytes on host.\n";
_row_bytes=0;
UCL_GERYON_EXIT;
#endif
_row_bytes=0;
return err;
}
@ -118,13 +118,13 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols;
return err;
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
@ -143,14 +143,14 @@ class UCL_H_Vec : public UCL_BaseMat {
CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
@ -159,31 +159,31 @@ class UCL_H_Vec : public UCL_BaseMat {
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
 * - If a matrix is used as input, all elements (including padding)
 *   will be used for the view
 * - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
 * - If a matrix is used as input, all elements (including padding)
 *   will be used for the view
 * - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) {
@ -197,38 +197,38 @@ class UCL_H_Vec : public UCL_BaseMat {
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_view(*this,dev,_row_bytes);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
@ -246,76 +246,76 @@ class UCL_H_Vec : public UCL_BaseMat {
_host_view(*this,input,_row_bytes);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
 * - If a matrix is used as input, all elements (including padding)
 *   will be used for the view
 * - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
 * - If a matrix is used as input, all elements (including padding)
 *   will be used for the view
 * - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _host_free(*this); _kind=UCL_VIEW; _cols=0; }
/// Resize the allocation to contain cols elements
@ -324,7 +324,7 @@ class UCL_H_Vec : public UCL_BaseMat {
assert(_kind!=UCL_VIEW);
_row_bytes=cols*sizeof(numtyp);
int err=_host_resize(*this,_row_bytes);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
@ -340,7 +340,7 @@ class UCL_H_Vec : public UCL_BaseMat {
_end=_array+cols;
return err;
}
/// Resize (only if bigger) the allocation to contain cols elements
/** \note Cannot be used on views **/
inline int resize_ib(const int cols)
@ -348,7 +348,7 @@ class UCL_H_Vec : public UCL_BaseMat {
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
@ -373,35 +373,35 @@ class UCL_H_Vec : public UCL_BaseMat {
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col)
{ return _array[col]; }
/// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT
/// For OpenCL, returns a reference to the cl_mem object
inline device_ptr & cbegin() { return _carray; }
/// For OpenCL, returns a reference to the cl_mem object
inline const device_ptr & cbegin() const { return _carray; }
#endif
private:
numtyp *_array, *_end;
size_t _row_bytes, _cols;
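Because several styles wrap raw host buffers this way, a short sketch of the pointer-view path may help (hypothetical names; assumes an initialized UCL_Device dev):

// Sketch only: wrap an existing C array in a non-owning UCL_H_Vec view.
double raw[256];
UCL_H_Vec<double> vec;
vec.view(raw,256,dev);  // aliases raw; no copy and no ownership taken
vec.zero();             // zeroes raw through the view
vec.clear();            // resets the view; raw itself is untouched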

View File

@ -34,25 +34,25 @@ class UCL_Matrix {
ROW_MAJOR = 1,
VECTOR = 0
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Mat<hosttype> host;
/// Device Allocation
UCL_D_Mat<devtype> device;
UCL_Matrix() { }
~UCL_Matrix() { }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
@ -74,7 +74,7 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
@ -92,9 +92,9 @@ class UCL_Matrix {
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to rows x cols elements
@ -106,10 +106,10 @@ class UCL_Matrix {
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,rows,cols);
}
/// Resize (only if bigger) the allocation to contain rows x cols elements
  inline int resize_ib(const int new_rows, const int new_cols)
    { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero (asynchronously on device)
@ -118,14 +118,14 @@ class UCL_Matrix {
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
@ -136,26 +136,26 @@ class UCL_Matrix {
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes()*device.rows(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access to the element at (row, col)
  inline hosttype & operator()(const int row, const int col)
    { return host(row,col); }
/// 2D access to the element at (row, col)
  inline const hosttype & operator()(const int row, const int col) const
    { return host(row,col); }
/// Returns pointer to memory pointer for allocation on host
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
@ -172,7 +172,7 @@ class UCL_Matrix {
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
@ -202,7 +202,7 @@ class UCL_Matrix {
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
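For orientation, the intended pattern with the paired containers is: fill the host copy, push, compute, pull back. A minimal sketch (hypothetical sizes; assumes an initialized UCL_Device dev and kernels that consume m.device):

// Sketch only: manual host/device synchronization with UCL_Matrix.
UCL_Matrix<double,float> m;  // double on host, float on device (cast buffer in between)
m.alloc(16,16,dev);
for (int i=0; i<16; i++)
  for (int j=0; j<16; j++)
    m(i,j)=i+0.1*j;          // fill the host copy
m.update_device();           // asynchronous host -> device copy
// ... launch kernels that read/write m.device ...
m.update_host();             // asynchronous device -> host copy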

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -53,9 +53,9 @@ typedef struct _double4 double4;
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#endif
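These defines are what allow one kernel source to compile as either OpenCL C or CUDA. A hypothetical kernel in the portable dialect (GLOBAL_ID_X is the per-backend thread-index macro used elsewhere in this library):

// Sketch only: under CUDA the macros above expand __kernel to
// extern "C" __global__, turn __local into __shared__, erase __global,
// and map atom_add onto atomicAdd.
__kernel void scale_vec(__global float *x, const float alpha, const int n) {
  int i=GLOBAL_ID_X;
  if (i<n)
    x[i]*=alpha;
}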

View File

@ -17,10 +17,10 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW
@ -40,7 +40,7 @@ template <> struct _ucl_print<1> {
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
int offset=0;
int row_size=cols;
@ -58,12 +58,12 @@ template <> struct _ucl_print<1> {
}
template <class mat_type>
static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out,const std::string delim,
const std::string row_delim, UCL_Device &dev) {
p(mat,rows,cols,out,delim,row_delim);
}
};
template <int mem> struct _ucl_print {
template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
@ -83,7 +83,7 @@ template <int mem> struct _ucl_print {
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),mat);
@ -91,12 +91,12 @@ template <int mem> struct _ucl_print {
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t rows,
                       const size_t cols,std::ostream &out,
                       const std::string delim,
const std::string row_delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),dev);
@ -104,9 +104,9 @@ template <int mem> struct _ucl_print {
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
};
// -------------------------------------------------------------------------
// - Non-const routines that do not require a device object
@ -123,13 +123,13 @@ inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
ucl_print(mat,n,out," ");
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n) {
@ -139,8 +139,8 @@ inline void ucl_print(mat_type &mat, const size_t n) {
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
@ -148,17 +148,17 @@ inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out) {
ucl_print(mat,rows,cols,out," ","\n");
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows,
const size_t cols) {
ucl_print(mat,rows,cols,std::cout," ","\n");
}
@ -177,7 +177,7 @@ inline void ucl_print(mat_type &mat, std::ostream &out) {
else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
}
// -------------------------------------------------------------------------
// - Const routines that do not require a device object
// -------------------------------------------------------------------------
@ -193,14 +193,14 @@ inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
UCL_Device &dev) {
ucl_print(mat,n,out," ",dev);
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n,
@ -211,7 +211,7 @@ inline void ucl_print(const mat_type &mat, const size_t n,
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim, UCL_Device &dev) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
@ -220,17 +220,17 @@ inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, UCL_Device &dev) {
ucl_print(mat,rows,cols,out," ","\n",dev);
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t rows,
const size_t cols, UCL_Device &dev) {
ucl_print(mat,rows,cols,std::cout," ","\n",dev);
}
@ -256,27 +256,27 @@ inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
  { ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
  { ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
  { ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
  { ucl_print(mat,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Vector<t1,t2> &mat)
  { ucl_print(mat.host,out); return out; }
template <class t1, class t2>
inline std::ostream & operator << (std::ostream &out, UCL_Matrix<t1,t2> &mat)
  { ucl_print(mat.host,out); return out; }
#endif
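A quick usage sketch of the printing interface declared above (hypothetical names; assumes an initialized UCL_Device dev). Device containers are staged through a temporary host vector before formatting, as the _ucl_print specializations show:

// Sketch only: ucl_print and operator<< work for host and device containers.
UCL_H_Vec<float> h(8,dev,UCL_READ_WRITE);
h.zero();
ucl_print(h,8,std::cout);      // first 8 elements, space delimited
std::cout << h << std::endl;   // whole container via operator<<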

View File

@ -3,7 +3,7 @@
-------------------
W. Michael Brown
Helper routines for allocating memory for s-objects and performing
host/device updates. (Different routines depending on whether the
same type is used on the host and device).
@ -141,29 +141,29 @@ template <> struct _ucl_s_obj_help<1> {
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
if (device.kind()==UCL_VIEW) {
@ -181,7 +181,7 @@ template <> struct _ucl_s_obj_help<1> {
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
@ -255,7 +255,7 @@ template <int st> struct _ucl_s_obj_help {
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
}
@ -314,7 +314,7 @@ template <int st> struct _ucl_s_obj_help {
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
}
@ -329,25 +329,25 @@ template <int st> struct _ucl_s_obj_help {
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_cast_copy(dst,src,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_cast_copy(dst,src,cols,buffer,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,rows,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,rows,cols,buffer,cq);
}
@ -373,7 +373,7 @@ template <int st> struct _ucl_s_obj_help {
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
int err=buff.resize(rows,cols);
if (err!=UCL_SUCCESS)

View File

@ -17,7 +17,7 @@
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
@ -26,65 +26,65 @@
// Assign an integer id based on the data type: (int, float, double, etc)
template <class eltype> struct _UCL_DATA_ID;
template <> struct _UCL_DATA_ID<double> {
  enum { id=1 };
  static inline const char * name() { return "double"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
};
template <> struct _UCL_DATA_ID<float> {
  enum { id=2 };
  static inline const char * name() { return "float"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
};
template <> struct _UCL_DATA_ID<unsigned> {
  enum { id=3 };
  static inline const char * name() { return "unsigned"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
};
template <> struct _UCL_DATA_ID<int> {
  enum { id=4 };
  static inline const char * name() { return "int"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
};
template <> struct _UCL_DATA_ID<char> {
  enum { id=5 };
  static inline const char * name() { return "char"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
};
template <> struct _UCL_DATA_ID<unsigned char> {
  enum { id=6 };
  static inline const char * name() { return "unsigned char"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
};
template <> struct _UCL_DATA_ID<short> {
  enum { id=7 };
  static inline const char * name() { return "short"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
};
template <> struct _UCL_DATA_ID<unsigned short> {
  enum { id=8 };
  static inline const char * name() { return "unsigned short"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
};
template <> struct _UCL_DATA_ID<long> {
  enum { id=9 };
  static inline const char * name() { return "long"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
};
template <> struct _UCL_DATA_ID<unsigned long> {
  enum { id=10 };
  static inline const char * name() { return "unsigned long"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
};
template <> struct _UCL_DATA_ID<long double> {
  enum { id=11 };
  static inline const char * name() { return "long double"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
};
template <class eltype> struct _UCL_DATA_ID {
  enum { id=0 };
  static inline const char * name() { return "error_type"; }
  static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
};
// Host memory allocation types
@ -97,7 +97,7 @@ enum UCL_MEMOPT {
UCL_NOT_SPECIFIED
};
enum UCL_DEVICE_TYPE {
UCL_DEFAULT, ///< Unknown device type
UCL_CPU, ///< Device is a CPU
UCL_GPU, ///< Device is a GPU
@ -111,7 +111,7 @@ enum UCL_ERROR_FLAG {
UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
UCL_COMPILE_ERROR, ///< Error compiling kernel
UCL_MEMORY_ERROR
};
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
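These traits drive run-time kernel specialization. A hedged sketch of how the flag strings might be assembled when compiling a kernel for a given numtyp:

// Sketch only: build compiler defines from the type traits.
std::string flags("-D UCL_NO_EXIT ");
flags+=_UCL_DATA_ID<double>::numtyp_flag();  // appends "-D NUMTYP=double"
std::cout << "compiling kernels for "
          << ucl_template_name<double>() << std::endl;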

View File

@ -34,25 +34,25 @@ class UCL_Vector {
ROW_MAJOR = 1,
VECTOR = 1
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Vec<hosttype> host;
/// Device Allocation
UCL_D_Vec<devtype> device;
UCL_Vector() { }
~UCL_Vector() { }
/// Construct with n columns
/** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory access from the host
* - UCL_READ_WRITE - Specify that you will read and write from host
@ -89,12 +89,12 @@ class UCL_Vector {
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_READ_WRITE,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements
@ -106,7 +106,7 @@ class UCL_Vector {
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,cols);
}
/// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
@ -117,14 +117,14 @@ class UCL_Vector {
inline void zero(const int n) { zero(n,cq()); }
/// Set each element to zero (asynchronously on device)
inline void zero(command_queue &cq) {
host.zero();
if (device.kind()!=UCL_VIEW) device.zero(cq);
else if (_buffer.numel()>0) _buffer.zero();
}
/// Set first n elements to zero (asynchronously on device)
inline void zero(const int n, command_queue &cq) {
host.zero(n);
if (device.kind()!=UCL_VIEW) device.zero(n,cq);
else if (_buffer.numel()>0) _buffer.zero();
}
@ -135,27 +135,27 @@ class UCL_Vector {
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()+_buffer.row_bytes(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access (row should always be 0)
inline hosttype & operator()(const int row, const int col)
{ return host[col]; }
/// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const
{ return host[col]; }
/// Returns pointer to memory pointer for allocation on host
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Change the default command queue associated with this data
@ -172,7 +172,7 @@ class UCL_Vector {
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
@ -202,7 +202,7 @@ class UCL_Vector {
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
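A short sketch of the grow-only resize and the asynchronous zeroing shown above (hypothetical names; assumes an initialized UCL_Device dev):

// Sketch only: resize_ib reallocates only when the request grows the vector.
UCL_Vector<int,int> counts;
counts.alloc(100,dev);
counts.resize_ib(50);      // no-op: request fits within the current 100 columns
counts.resize_ib(200);     // grows both the host and device copies
counts.zero(counts.cq());  // host cleared now, device cleared on its queue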

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -24,7 +24,7 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
}
template <class numtyp, class acctyp>
int AnswerT::bytes_per_atom() const {
int bytes=11*sizeof(acctyp);
if (_rot)
bytes+=4*sizeof(acctyp);
@ -38,19 +38,19 @@ bool AnswerT::alloc(const int inum) {
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
_ans_fields=4;
if (_rot)
_ans_fields+=4;
// --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY,
UCL_READ_WRITE)==UCL_SUCCESS);
_gpu_bytes=engv.device.row_bytes()+force.device.row_bytes();
_allocated=true;
return success;
}
@ -69,21 +69,21 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
// Initialize timers for the selected device
time_answer.init(*dev);
time_answer.zero();
_time_cast=0.0;
_time_cpu_idle=0.0;
return success && alloc(ef_inum);
}
template <class numtyp, class acctyp>
bool AnswerT::add_fields(const bool charge, const bool rot) {
bool realloc=false;
@ -127,15 +127,15 @@ void AnswerT::clear() {
template <class numtyp, class acctyp>
double AnswerT::host_memory_usage() const {
int atom_bytes=4;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(Answer<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AnswerT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
@ -144,8 +144,8 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
@ -180,7 +180,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
for (int i=0; i<_inum; i++)
evdwl+=engv[i];
if (_ef_atom)
if (_ilist==NULL)
for (int i=0; i<_inum; i++)
eatom[i]+=engv[i];
else
@ -196,18 +196,18 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
if (_vf_atom)
if (_ilist==NULL) {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[ii++][j]+=engv[i];
} else {
int ii=0;
for (int i=vstart; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum;
iend+=_inum;
}
}
return evdwl;
}
@ -242,8 +242,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
}
vstart=iend;
iend+=_inum;
}
if (_vflag) {
for (int j=0; j<6; j++) {
for (int i=vstart; i<iend; i++)
virial[j]+=engv[i];
@ -254,12 +254,12 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
} else {
for (int i=vstart, ii=0; i<iend; i++)
vatom[_ilist[ii++]][j]+=engv[i];
}
vstart+=_inum;
iend+=_inum;
}
}
return evdwl;
}
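From the two energy_virial variants above, the engv accumulator appears to be laid out field by field, each field a contiguous block of _inum values (an inference from the loops, not a documented contract):

// Sketch only: inferred engv layout read back by energy_virial().
//   [ energy blocks: _e_fields x _inum | virial blocks: 6 x _inum ]
// Field f of local atom i then lives at engv[f*_inum + i], which is why the
// loops above advance vstart and iend by _inum per component.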

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -30,7 +30,7 @@ AtomT::Atom() : _compiled(false),_allocated(false),
}
template <class numtyp, class acctyp>
int AtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor==1)
id_space=2;
@ -51,7 +51,7 @@ bool AtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
bool success=true;
// Ignore host/device transfers?
_host_view=false;
if (dev->shared_memory() && sizeof(numtyp)==sizeof(double)) {
@ -60,11 +60,11 @@ bool AtomT::alloc(const int nall) {
assert(0==1);
#endif
}
// Allocate storage for CUDPP sort
#ifdef USE_CUDPP
if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
@ -110,7 +110,7 @@ bool AtomT::alloc(const int nall) {
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
if (_gpu_nbor==2 && _host_view)
@ -124,8 +124,8 @@ bool AtomT::alloc(const int nall) {
gpu_bytes+=x.device.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
@ -135,7 +135,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
if (charge && _charge==false) {
_charge=true;
_other=true;
@ -179,7 +179,7 @@ bool AtomT::add_fields(const bool charge, const bool rot,
_gpu_nbor=gpu_nbor;
#ifdef USE_CUDPP
if (_gpu_nbor==1) {
CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0);
if (CUDPP_SUCCESS != result)
return false;
}
@ -198,9 +198,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
} else {
success=success && (host_particle_id.alloc(_max_atoms,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success &&
(host_cell_id.alloc(_max_atoms,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
}
}
return success;
@ -230,7 +230,7 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_q.init(*dev);
@ -241,14 +241,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_quat.zero();
time_vel.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
void AtomT::clear_resize() {
if (!_allocated)
@ -274,7 +274,7 @@ void AtomT::clear_resize() {
#ifdef USE_CUDPP
if (_gpu_nbor==1) cudppDestroyPlan(sort_plan);
#endif
if (_gpu_nbor==2) {
host_particle_id.clear();
host_cell_id.clear();
@ -305,21 +305,21 @@ void AtomT::clear() {
template <class numtyp, class acctyp>
double AtomT::host_memory_usage() const {
int atom_bytes=4;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
    atom_bytes+=4;
  if (_vel)
atom_bytes+=4;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) {
#ifdef USE_CUDPP
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),
8*sizeof(unsigned), num_atoms);
if (CUDPP_SUCCESS != result) {
printf("Error in cudppSort\n");

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
@ -17,9 +17,9 @@
#include "lal_preprocessor.h"
#endif
__kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
                            const __global double *restrict x,
                            const __global int *restrict type,
const int nall) {
int ii=GLOBAL_ID_X;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -57,19 +57,19 @@ class Atom {
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
@ -81,7 +81,7 @@ class Atom {
}
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor 0 if neighboring will be performed on host
@ -89,28 +89,28 @@ class Atom {
* gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quaternion() { return _rot; }
/// Returns true if GPU is using velocities
bool velocity() { return _vel; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
/// Free all memory on host and device
void clear();
/// Return the total amount of host memory used by class in bytes
double host_memory_usage() const;
/// Sort arrays for neighbor list calculation on device
void sort_neighbor(const int num_atoms);
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
@ -150,18 +150,18 @@ class Atom {
total+=time_vel.total_seconds();
time_vel.zero_total();
}
return total+_time_transfer/1000.0;
}
/// Return the total time for data cast/pack
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
inline void type_pack1(const int n, const int m_size,
UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
t1 **one) {
int ii=0;
@ -215,7 +215,7 @@ class Atom {
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
ucl_copy(dev_v,view,false);
}
/// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
template <class dev_typ, class t1, class t2, class t3, class t4>
inline void type_pack4(const int n, const int m_size,
@ -239,7 +239,7 @@ class Atom {
/// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
template <class dev_typ, class t1, class t2>
inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
for (int i=0; i<n; i++) {
buffer[i*2]=static_cast<numtyp>(one[i][i]);
@ -279,7 +279,7 @@ class Atom {
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
@ -376,7 +376,7 @@ class Atom {
/// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
time_vel.start();
if (_v_avail==false) {
#ifdef GPU_CAST
@ -407,8 +407,8 @@ class Atom {
inline void add_transfer_time(double t) { _time_transfer+=t; }
/// Return number of bytes used on device
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
/// Returns true if the device is addressing memory on the host
inline bool host_view() { return _host_view; }
@ -422,7 +422,7 @@ class Atom {
  /// Quaternions
UCL_Vector<numtyp,numtyp> quat;
/// Velocities
UCL_Vector<numtyp,numtyp> v;
#ifdef GPU_CAST
UCL_Vector<double,double> x_cast;
@ -436,7 +436,7 @@ class Atom {
/// Atom tag information for device nbor builds
UCL_D_Vec<tagint> dev_tag;
/// Cell list identifiers for hybrid nbor builds
UCL_H_Vec<int> host_cell_id;
/// Cell list identifiers for hybrid nbor builds
@ -444,7 +444,7 @@ class Atom {
/// Device timers
UCL_Timer time_pos, time_q, time_quat, time_vel;
/// Geryon device
UCL_Device *dev;
@ -456,19 +456,19 @@ class Atom {
#endif
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _max_atoms, _nall, _gpu_nbor;
bool _host_view;
double _time_cast, _time_transfer;
double _max_gpu_bytes;
#ifdef USE_CUDPP
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -44,7 +44,7 @@ class Balance {
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
@ -96,7 +96,7 @@ class Balance {
inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time);
@ -105,13 +105,13 @@ class Balance {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
Device<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done;
int _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
@ -123,15 +123,15 @@ class Balance {
#define BalanceT Balance<numtyp,acctyp>
template <class numtyp, class acctyp>
void BalanceT::init(Device<numtyp, acctyp> *gpu,
const int gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
_device_time.init(*gpu->gpu);
if (split<0.0) {
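    // a negative split requests dynamic load balancing: start by sending
    // 90% of the particles to the device and adapt from measured times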
_load_balance=true;
_desired_split=0.90;
@ -163,7 +163,7 @@ int BalanceT::get_gpu_count(const int ago, const int inum_full) {
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void BalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
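The balance() routine above recomputes the host/device split from the measured CPU and device times (only every _HD_BALANCE_EVERY timesteps, plus the first 10). A minimal standalone sketch of that idea, assuming a damped equal-finish-time update; the function name, damping factor, and clamping bounds are illustrative assumptions, not the LAMMPS_AL code:

    #include <algorithm>

    // Choose the device fraction that would let host and device finish
    // together, given times measured at the current split, then damp it.
    double update_split(double split, double cpu_time, double gpu_time) {
      if (cpu_time <= 0.0 || gpu_time <= 0.0) return split; // no timings yet
      // equal finish: s'*gpu_time/split == (1-s')*cpu_time/(1-split)
      double ideal = split*cpu_time/((1.0-split)*gpu_time + split*cpu_time);
      split = 0.5*split + 0.5*ideal;            // damp to avoid oscillation
      return std::min(std::max(split, 0.05), 1.0);
    }

Damping matters because the per-step times are noisy; an undamped update can ping-pong between extreme splits instead of converging.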

View File

@ -9,10 +9,10 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_atomic.h"
using namespace LAMMPS_AL;
#define BaseAtomicT BaseAtomic<numtyp, acctyp>
@ -63,13 +63,13 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
@ -139,7 +139,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
@ -188,7 +188,7 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
@ -217,7 +217,7 @@ template <class numtyp, class acctyp>
int ** BaseAtomicT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
@ -230,12 +230,12 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -255,7 +255,7 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -41,7 +41,7 @@ class BaseAtomic {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -49,8 +49,8 @@ class BaseAtomic {
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
@ -80,7 +80,7 @@ class BaseAtomic {
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
@ -119,7 +119,7 @@ class BaseAtomic {
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success);
/// Pair loop with host neighboring
@ -133,19 +133,19 @@ class BaseAtomic {
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -64,7 +64,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
@ -192,7 +192,7 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
@ -226,7 +226,7 @@ template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
@ -240,12 +240,12 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -271,7 +271,7 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -42,7 +42,7 @@ class BaseCharge {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -83,7 +83,7 @@ class BaseCharge {
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
@ -137,12 +137,12 @@ class BaseCharge {
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -65,7 +65,7 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
@ -155,7 +155,7 @@ template <class numtyp, class acctyp>
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
@ -194,7 +194,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
@ -230,12 +230,12 @@ template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double **host_mu,
double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
@ -245,12 +245,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -279,7 +279,7 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}

View File

@ -10,7 +10,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -40,7 +40,7 @@ class BaseDipole {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -82,7 +82,7 @@ class BaseDipole {
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
@ -136,12 +136,12 @@ class BaseDipole {
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double **mu, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;

View File

@ -64,7 +64,7 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom,true);
@ -153,7 +153,7 @@ template <class numtyp, class acctyp>
inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
@ -182,7 +182,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag, double **host_v,
const double dtinvsqrt, const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
@ -193,7 +193,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
@ -228,12 +228,12 @@ template <class numtyp, class acctyp>
int** BaseDPDT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
double *boxlo, double *prd) {
acc_timers();
@ -244,12 +244,12 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -276,7 +276,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}

View File

@ -40,7 +40,7 @@ class BaseDPD {
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -81,7 +81,7 @@ class BaseDPD {
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
@ -129,20 +129,20 @@ class BaseDPD {
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, tagint *tag,
double **v, const double dtinvsqrt, const int seed,
const int timestep, const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **v, const double dtinvsqrt, const int seed,
const int timestep, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;

View File

@ -70,7 +70,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true,
1);
@ -113,7 +113,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
return -8;
if (_multiple_forms && gpu_nbor!=0)
return -9;
if (_multiple_forms)
ans->force.zero();
@ -142,7 +142,7 @@ void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (_compiled) {
k_nbor_fast.clear();
k_nbor.clear();
@ -156,7 +156,7 @@ void BaseEllipsoidT::clear_base() {
delete lj_program;
_compiled=false;
}
time_nbor1.clear();
time_ellipsoid.clear();
time_nbor2.clear();
@ -230,7 +230,7 @@ void BaseEllipsoidT::output_times() {
if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
@ -241,10 +241,10 @@ void BaseEllipsoidT::output_times() {
}
// ---------------------------------------------------------------------------
// Pack neighbors to limit thread divergence for lj-lj and ellipse
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
const int inum, const int form_low,
const int form_high, const bool shared_types,
int ntypes) {
@ -264,18 +264,18 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
const int osize, int *ilist,
int *numj, int *type, int **firstneigh,
bool &success) {
success=true;
int mn=nbor->max_nbor_loop(osize,numj,ilist);
resize_atom(nall,success);
resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (_multiple_forms) {
int p=0;
for (int i=0; i<osize; i++) {
@ -315,7 +315,7 @@ template <class numtyp, class acctyp>
inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
@ -354,7 +354,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
zero_timers();
return NULL;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
@ -394,7 +394,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
@ -410,7 +410,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
ans->inum(inum);
_last_ellipse=std::min(inum,_max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -419,7 +419,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
return NULL;
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_quat_data(host_quat[0]);
hd_balancer.start_timer();
@ -444,9 +444,9 @@ double BaseEllipsoidT::host_memory_usage_base() const {
}
template <class numtyp, class acctyp>
void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
const void *ellipsoid_string,
const void *lj_string,
const char *kname, const bool e_s) {
if (_compiled)
return;

View File

@ -42,7 +42,7 @@ class BaseEllipsoid {
* \param gpu_split fraction of particles handled by device
* \param ellipsoid_sphere true if ellipsoid-sphere case handled separately
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -68,7 +68,7 @@ class BaseEllipsoid {
quat_tex.bind_float(atom->quat,4);
lj_pos_tex.bind_float(atom->x,4);
lj_quat_tex.bind_float(atom->quat,4);
}
}
/// Check if there is enough storage for neighbors and realloc if not
@ -78,7 +78,7 @@ class BaseEllipsoid {
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
@ -101,7 +101,7 @@ class BaseEllipsoid {
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_base();
/// Output any timing information
void output_times();
@ -130,7 +130,7 @@ class BaseEllipsoid {
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_nbor1.zero();
@ -148,9 +148,9 @@ class BaseEllipsoid {
ans->zero_timers();
}
/// Pack neighbors to limit thread divergence for lj-lj and ellipse
void pack_nbors(const int GX, const int BX, const int start, const int inum,
const int form_low, const int form_high,
const bool shared_types, int ntypes);
/// Copy neighbor list from host
@ -174,17 +174,17 @@ class BaseEllipsoid {
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double **host_quat);
/// Build neighbor list on accelerator
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
@ -207,7 +207,7 @@ class BaseEllipsoid {
/// Atom Data
Atom<numtyp,acctyp> *atom;
// --------------------------- TYPE DATA --------------------------
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
@ -240,7 +240,7 @@ class BaseEllipsoid {
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
int **_host_form;
int _last_ellipse, _max_last_ellipse;

View File

@ -12,7 +12,7 @@
begin : Tue April 2, 2013
email : brownw@ornl.gov
***************************************************************************/
#include "lal_base_three.h"
using namespace LAMMPS_AL;
#define BaseThreeT BaseThree<numtyp, acctyp>
@ -45,7 +45,7 @@ int BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const {
#ifdef THREE_CONCURRENT
b+=ans2->bytes_per_atom();
#endif
return b;
}
template <class numtyp, class acctyp>
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
_gpu_nbor=gpu_nbor;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -76,7 +77,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
@ -93,7 +94,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
return -3;
ans2->cq(_end_command_queue);
#endif
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
@ -111,7 +112,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
#ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes();
#endif
return 0;
}
@ -158,7 +159,7 @@ void BaseThreeT::clear_atomic() {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
int *ilist, int *numj, int **firstneigh,
bool &success) {
success=true;
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
// originally the requirement that nall == nlist was enforced
// to allow direct indexing of neighbors of neighbors after re-arrangement:
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// now the requirement is removed, allowing this to work within pair hybrid
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -176,7 +182,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
#endif
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success);
@ -214,11 +220,11 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
if (nlist==0) {
@ -228,9 +234,9 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
#ifdef THREE_CONCURRENT
ans2->inum(inum);
@ -270,7 +276,7 @@ template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
@ -283,7 +289,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
@ -291,7 +297,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
@ -321,7 +327,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
@ -352,7 +358,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#endif
_compiled=true;
}
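The reset_nbors() change above is the one substantive edit in this file: get_host3() required nall == nlist so that a neighbor's own neighbor row could be indexed directly by atom index, while get_host() packs one row per listed atom in ilist order, which is what a sub-style under pair hybrid supplies. A sketch of the indexing consequence (build_row_map is a hypothetical helper, not part of the library):

    #include <vector>

    // With rows packed in ilist order, finding a neighbor's own neighbor
    // row needs an atom-index -> row map; under the old nall == nlist
    // layout the map was simply the identity (row == atom index).
    std::vector<int> build_row_map(int nall, int nlist, const int *ilist) {
      std::vector<int> row(nall, -1); // -1: atom has no row in this sub-list
      for (int r = 0; r < nlist; ++r)
        row[ilist[r]] = r;            // atom ilist[r]'s neighbors are row r
      return row;
    }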

View File

@ -44,7 +44,7 @@ class BaseThree {
* \param gpu_split fraction of particles handled by device
* \param k_two name for the kernel for 2-body force calculation
* \param k_three name for the kernel for 3-body force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -53,8 +53,8 @@ class BaseThree {
* - -5 Double precision is not supported on card
* - -10 if invalid thread_per_atom setting **/
int init_three(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
@ -88,7 +88,7 @@ class BaseThree {
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
@ -133,33 +133,33 @@ class BaseThree {
/// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
@ -186,7 +186,7 @@ class BaseThree {
Answer<numtyp,acctyp> *ans;
#ifdef THREE_CONCURRENT
Answer<numtyp,acctyp> *ans2;
#endif
// --------------------------- NBOR DATA ----------------------------
@ -205,15 +205,16 @@ class BaseThree {
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;
};

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -33,17 +33,17 @@ BeckT::Beck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BeckT::~Beck() {
clear();
}
template <class numtyp, class acctyp>
int BeckT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BeckT::init(const int ntypes,
double **host_cutsq, double **host_aa,
double **host_alpha, double **host_beta,
double **host_AA, double **host_BB,
@ -126,7 +126,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
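  // one block of BX threads serves BX/_threads_per_atom atoms, so launch
  // enough blocks (GX) to cover all inum atoms handled by the device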
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -24,7 +24,7 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_beck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict beck1,
const __global numtyp4 *restrict beck2,
const int lj_types,
@ -50,20 +50,20 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
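    // the t_per_atom threads cooperating on atom ii walk its neighbor
    // list together, each from its own offset with stride n_stride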
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
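      // the top bits of j encode the special-bond class: sbmask() picks
      // the matching scale factor, NEIGHMASK recovers the atom index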
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -76,7 +76,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq);
@ -103,7 +103,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -133,7 +133,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
@ -143,7 +143,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck1[tid]=beck1_in[tid];
beck2[tid]=beck2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -152,7 +152,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
@ -166,7 +166,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -179,7 +179,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<beck2[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp r5 = rsq*rsq*r;
@ -205,7 +205,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Beck : public BaseAtomic<numtyp, acctyp> {
public:
Beck();
~Beck();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -41,8 +41,8 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
double **host_aa, double **host_alpha,
double **host_beta, double **host_AA,
double **host_BB, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
@ -67,7 +67,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -77,7 +77,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
cell_size, gpu_split, screen);
BLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -102,8 +102,8 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full,
return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void beck_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -33,10 +33,10 @@ BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BornT::~Born() {
clear();
}
template <class numtyp, class acctyp>
int BornT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -44,12 +44,12 @@ int BornT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int BornT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
@ -102,18 +102,18 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_offset) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
}
template <class numtyp, class acctyp>
@ -151,7 +151,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -169,7 +169,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
&cutsq_sigma, &_lj_types, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -24,16 +24,16 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_born(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const __global numtyp2 *restrict cutsq_sigma,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -51,20 +51,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -77,17 +77,17 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
@ -95,7 +95,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -113,20 +113,20 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
} // if ii
}
__kernel void k_born_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp2 *restrict cutsq_sigma,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
@ -137,7 +137,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -146,7 +146,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
@ -160,7 +160,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -173,13 +173,13 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
@ -190,7 +190,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Born : public BaseAtomic<numtyp, acctyp> {
public:
Born();
~Born();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -38,20 +38,20 @@ class Born : public BaseAtomic<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_rhoinv,
double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -77,7 +77,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -84,12 +84,12 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
@ -142,7 +142,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -157,15 +157,15 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
&this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_born_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -64,14 +64,14 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@ -114,129 +114,129 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
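The _erfc expression in the kernel above is the Abramowitz and Stegun 7.1.26 rational fit to erfc(grij), evaluated inline so the device needs no libm call. A minimal host-side sketch of the same fit, assuming the constant values LAMMPS conventionally uses for EWALD_P and A1..A5, compared against std::erfc:

#include <cmath>
#include <cstdio>

static double erfc_fit(double x) {
  // Abramowitz & Stegun 7.1.26 constants (assumed here; defined elsewhere
  // in the library as EWALD_P and A1..A5).
  const double EWALD_P = 0.3275911;
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  const double t = 1.0 / (1.0 + EWALD_P * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}

int main() {
  for (double x = 0.0; x <= 3.0; x += 0.5)
    std::printf("x=%.1f  fit=%.7f  std::erfc=%.7f\n", x, erfc_fit(x), std::erfc(x));
}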
__kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
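For orientation, the coeff1 packing used by both kernels corresponds to the Born-Mayer-Huggins potential U(r) = A exp((sigma-r)/rho) - C/r^6 + D/r^8 with coeff1 = (1/rho, A/rho, 6C, 8D), so the forceborn expression times r2inv reproduces -dU/dr along each component of the separation. A self-contained sketch with made-up coefficients that checks this against a numerical derivative:

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative (made-up) Born parameters.
  const double A = 500.0, rho = 0.3, sigma = 2.0, C = 20.0, D = 5.0;
  const double born1 = A / rho, born2 = 6.0 * C, born3 = 8.0 * D;
  const double r = 2.5;
  auto U = [&](double s) {
    return A * std::exp((sigma - s) / rho) - C / std::pow(s, 6)
           + D / std::pow(s, 8);
  };
  // The kernel form (born1*r*rexp - born2*r6inv + born3*r2inv*r6inv)*r2inv,
  // projected on r, reduces to born1*rexp - born2/r^7 + born3/r^9.
  const double rexp = std::exp((sigma - r) / rho);
  const double F = born1 * rexp - born2 / std::pow(r, 7) + born3 / std::pow(r, 9);
  const double Fnum = -(U(r + 1e-6) - U(r - 1e-6)) / 2e-6;
  std::printf("analytic F = %.8f  numeric -dU/dr = %.8f\n", F, Fnum);
}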

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -30,19 +30,19 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
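A hedged caller-side sketch of how the return codes documented above could be handled; report_init_status and its messages are hypothetical, only the codes themselves come from the comment:

#include <cstdio>

static void report_init_status(int err) {
  switch (err) {
    case  0: std::puts("init OK"); break;
    case -1: std::puts("fix gpu not found"); break;
    case -3: std::puts("out of memory error"); break;
    case -4: std::puts("GPU library was not compiled for GPU"); break;
    case -5: std::puts("double precision is not supported on card"); break;
    default: std::puts("unrecognized init status"); break;
  }
}

int main() { report_init_status(-3); }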
@ -59,12 +59,12 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
@ -73,7 +73,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
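The comments above describe flat, row-major tables of vec4 entries per type pair, which is what lets the kernels fetch all coefficients for a pair with one index (mtype = itype*lj_types + jtype). A minimal sketch of that layout; num4 and pack4 are illustrative stand-ins, not the actual UCL_D_Vec/type_pack4 API:

#include <vector>

struct num4 { double x, y, z, w; };   // stand-in for numtyp4

// Flatten per-pair host tables so out[i*ntypes + j] holds
// (rhoinv, born1, born2, born3) for the (i,j) type pair.
std::vector<num4> pack4(int ntypes, double **rhoinv, double **born1,
                        double **born2, double **born3) {
  std::vector<num4> out(ntypes * ntypes);
  for (int i = 0; i < ntypes; ++i)
    for (int j = 0; j < ntypes; ++j)
      out[i * ntypes + j] = {rhoinv[i][j], born1[i][j],
                             born2[i][j], born3[i][j]};
  return out;
}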

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -30,9 +30,9 @@ static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
@ -58,10 +58,10 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->world_barrier();
@ -78,14 +78,14 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -102,7 +102,7 @@ void borncl_gpu_clear() {
int** borncl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -112,8 +112,8 @@ int** borncl_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -37,17 +37,17 @@ template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -84,12 +84,12 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
@ -144,7 +144,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -157,17 +157,17 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -31,21 +31,21 @@ texture<int2> q_tex;
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_born_wolf(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -67,20 +67,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
@ -108,12 +108,12 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
@ -147,7 +147,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -165,20 +165,20 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
} // if ii
}
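The e_self term at the top of the kernel is the Wolf-summation self energy, -(e_shift/2 + alf/sqrt(pi)) q_i^2 qqrd2e. It is divided by t_per_atom so the t_per_atom threads cooperating on atom i contribute it exactly once in total, and the factor of 2 presumably offsets the halving applied when per-pair energies are stored. A plain restatement as a sketch (the parameter values in main are illustrative only):

#include <cmath>
#include <cstdio>

// Wolf self-energy per atom, matching the kernel's e_self expression.
double wolf_self(double q, double alf, double e_shift, double qqrd2e) {
  const double MY_PIS = std::sqrt(M_PI);   // the kernel's MY_PIS constant
  return -(0.5 * e_shift + alf / MY_PIS) * q * q * qqrd2e;
}

int main() {
  std::printf("e_self = %.6f\n", wolf_self(1.0, 0.2, 0.01, 332.06371));
}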
__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -193,7 +193,7 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@ -201,23 +201,23 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
@ -244,12 +244,12 @@ __kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
numtyp forcecoul, forceborn, force, r6inv, prefactor;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -30,19 +30,19 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double alf, const double e_shift,
@ -60,12 +60,12 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
@ -74,7 +74,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -28,7 +28,7 @@ static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
@ -60,9 +60,9 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->world_barrier();
@ -79,15 +79,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -104,7 +104,7 @@ void borncw_gpu_clear() {
int** borncw_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -114,8 +114,8 @@ int** borncw_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -28,9 +28,9 @@ static Born<PRECISION,ACC_PRECISION> BORNMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **sigma,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
@ -56,7 +56,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
@ -75,13 +75,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -102,24 +102,24 @@ void born_gpu_reinit(const int ntypes, double **host_rhoinv,
int world_me=BORNMF.device->world_me();
int gpu_rank=BORNMF.device->gpu_rank();
int procs_per_gpu=BORNMF.device->procs_per_gpu();
if (world_me==0)
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, offset);
BORNMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
BORNMF.reinit(ntypes, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, offset);
BORNMF.device->gpu_barrier();
}
}
void born_gpu_clear() {
BORNMF.clear();
}
int ** born_gpu_compute_n(const int ago, const int inum_full,
@ -132,8 +132,8 @@ int ** born_gpu_compute_n(const int ago, const int inum_full,
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void born_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -33,10 +33,10 @@ BuckT::Buck() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BuckT::~Buck() {
clear();
}
template <class numtyp, class acctyp>
int BuckT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -44,11 +44,11 @@ int BuckT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int BuckT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -95,14 +95,14 @@ template <class numtyp, class acctyp>
void BuckT::reinit(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
host_buck1,host_buck2,host_cutsq);
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
@ -143,7 +143,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -154,13 +154,13 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_buck(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -50,20 +50,20 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -76,24 +76,24 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<coeff1[mtype].w) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -111,19 +111,19 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
} // if ii
}
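The Buckingham branch above matches U(r) = A exp(-r/rho) - C/r^6 with coeff1 = (1/rho, A/rho, 6C, cutsq), so force times the separation components is -dU/dr along each axis. A self-contained check with made-up coefficients:

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative (made-up) Buckingham parameters.
  const double A = 1000.0, rho = 0.3, C = 10.0, r = 1.1;
  const double buck1 = A / rho, buck2 = 6.0 * C;
  auto U = [&](double s) { return A * std::exp(-s / rho) - C / std::pow(s, 6); };
  // The kernel form r2inv*(buck1*r*rexp - buck2*r6inv), projected on r,
  // reduces to buck1*exp(-r/rho) - buck2/r^7.
  const double F = buck1 * std::exp(-r / rho) - buck2 / std::pow(r, 7);
  const double Fnum = -(U(r + 1e-6) - U(r - 1e-6)) / 2e-6;
  std::printf("analytic F = %.8f  numeric -dU/dr = %.8f\n", F, Fnum);
}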
__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
@ -134,7 +134,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -143,7 +143,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
@ -157,7 +157,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -170,13 +170,13 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<coeff1[mtype].w) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp(-r*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
force*=factor_lj;
@ -186,7 +186,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Buck : public BaseAtomic<numtyp, acctyp> {
public:
Buck();
~Buck();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -38,18 +38,18 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -72,7 +72,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -33,10 +33,10 @@ BuckCoulT::BuckCoul() : BaseCharge<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BuckCoulT::~BuckCoul() {
clear();
}
template <class numtyp, class acctyp>
int BuckCoulT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -44,11 +44,11 @@ int BuckCoulT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int BuckCoulT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e) {
@ -81,21 +81,21 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
host_cut_ljsq, host_cut_coulsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e = qqrd2e;
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()+sp_lj.row_bytes();
return 0;
@ -135,7 +135,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -147,12 +147,12 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_buck_coul(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
const __global numtyp4 *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@ -63,21 +63,21 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
@ -91,30 +91,30 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forcebuck, force, r6inv;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq[mtype].y) { // buckingham
numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forcebuck = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj;
} else
forcebuck = (numtyp)0.0;
if (rsq < coeff2[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
@ -142,22 +142,22 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
} // if ii
}
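Throughout these kernels the top two bits of each packed neighbor index carry the special-bond class, which sbmask() extracts to select factor_lj/factor_coul before NEIGHMASK strips the bits. A sketch of that encoding, assuming the usual LAMMPS values SBBITS = 30 and NEIGHMASK = 0x3FFFFFFF:

#include <cstdio>

const int SBBITS = 30;
const int NEIGHMASK = 0x3FFFFFFF;
inline int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  double sp_lj[4] = {1.0, 0.0, 0.0, 0.5};       // example special_bonds factors
  int j = (int)((3u << SBBITS) | 12345u);       // neighbor 12345, 1-4 special class
  std::printf("factor_lj = %g  neighbor index = %d\n",
              sp_lj[sbmask(j)], j & NEIGHMASK);
}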
__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp4 *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -170,7 +170,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@ -180,7 +180,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
@ -195,7 +195,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
@ -209,27 +209,27 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forcebuck, force, r6inv;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq[mtype].y) { // buckingham
numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forcebuck = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj;
} else
forcebuck = (numtyp)0.0;
if (rsq < cutsq[mtype].z) {
fetch(forcecoul,j,q_tex);
forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
} else
forcecoul = (numtyp)0.0;
force = (forcebuck + forcecoul) * r2inv;
f.x+=delx*force;
@ -241,7 +241,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class BuckCoul : public BaseCharge<numtyp, acctyp> {
public:
BuckCoul();
~BuckCoul();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -38,11 +38,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
@ -71,11 +71,11 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -28,8 +28,8 @@ static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
@ -57,9 +57,9 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
@ -77,14 +77,14 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
BUCKCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -96,12 +96,12 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
}
void buckc_gpu_clear() {
BUCKCMF.clear();
}
int ** buckc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -111,8 +111,8 @@ int ** buckc_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void buckc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
BuckCoulLongT::~BuckCoulLongT() {
clear();
}
template <class numtyp, class acctyp>
int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -45,8 +45,8 @@ int BuckCoulLongT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -83,11 +83,11 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
@ -139,7 +139,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -150,16 +150,16 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -29,19 +29,19 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
const __global numtyp4 *restrict coeff2,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -64,14 +64,14 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@ -98,136 +98,136 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < coeff1[mtype].w) { // cut_ljsq
numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
force_lj = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj;
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
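The real-space Ewald factor _erfc above is a Horner-form polynomial approximation of erfc(g_ewald*r). The EWALD_P and A1..A5 constants are defined in the library's preprocessor header; the values below are the standard Abramowitz & Stegun 7.1.26 coefficients, stated here as an assumption rather than copied from this diff:

#include <cmath>

// Same algebra as the kernel: t = 1/(1+p*x); erfc(x) ~= t*poly(t)*exp(-x*x)
static double erfc_approx(double x) {
  const double EWALD_P = 0.3275911;          // assumed A&S 7.1.26 values
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741;
  const double A4 = -1.453152027, A5 =  1.061405429;
  const double expm2 = exp(-x*x);
  const double t = 1.0/(1.0 + EWALD_P*x);
  return t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))*expm2;
}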
__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1_in,
const __global numtyp4 *restrict coeff2_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const __global numtyp *restrict cutsq,
const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < coeff1[mtype].w) {
numtyp r=ucl_sqrt(rsq);
rexp = ucl_exp(-r*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
force_lj = (coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv)*factor_lj;
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -30,7 +30,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -38,11 +38,11 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
@ -71,7 +71,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -28,7 +28,7 @@ static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
@ -58,8 +58,8 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
@ -77,13 +77,13 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
BUCKCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -100,7 +100,7 @@ void buckcl_gpu_clear() {
int** buckcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -110,8 +110,8 @@ int** buckcl_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void buckcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -28,8 +28,8 @@ static Buck<PRECISION,ACC_PRECISION> BUCKMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_buck1, double **host_buck2,
double **host_a, double **host_c,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
@ -55,7 +55,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
@ -73,12 +73,12 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BUCKMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -98,24 +98,24 @@ void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv,
int world_me=BUCKMF.device->world_me();
int gpu_rank=BUCKMF.device->gpu_rank();
int procs_per_gpu=BUCKMF.device->procs_per_gpu();
if (world_me==0)
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset);
BUCKMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
BUCKMF.reinit(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset);
BUCKMF.device->gpu_barrier();
}
}
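buck_gpu_init and buck_gpu_reinit both follow the staggered pattern visible above: one rank per world touches the device data first, then the ranks sharing each GPU take turns behind barriers. The shape of that pattern, as a sketch with hypothetical names:

// world rank 0 initializes (and triggers any JIT/copy work) first
if (world_me == 0)
  MF.init(/* ... type data ... */);
MF.device->world_barrier();
// remaining ranks on each GPU go one at a time
for (int i = 0; i < procs_per_gpu; i++) {
  if (gpu_rank == i && world_me != 0)
    MF.init(/* ... type data ... */);
  MF.device->gpu_barrier();
}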
void buck_gpu_clear() {
BUCKMF.clear();
}
int ** buck_gpu_compute_n(const int ago, const int inum_full,
@ -128,8 +128,8 @@ int ** buck_gpu_compute_n(const int ago, const int inum_full,
return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void buck_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -33,23 +33,23 @@ CGCMMT::CGCMM() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
CGCMMT::~CGCMM() {
clear();
}
template <class numtyp, class acctyp>
int CGCMMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CGCMMT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@ -75,12 +75,12 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
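type_pack4 flattens the ntypes x ntypes host matrices into one numtyp4 per type pair on the device (here lj1 = {cutsq, cg_type, lj1, lj2}). A rough sketch of the packing it performs, assuming row-major indexing over the padded cmm_types count (hypothetical loop, not the library routine itself):

// host_write[i*cmm_types+j] = {cutsq[i][j], cg_type[i][j], lj1[i][j], lj2[i][j]}
for (int i = 1; i < ntypes; i++)
  for (int j = 1; j < ntypes; j++) {
    const int idx = i*cmm_types + j;
    host_write[idx].x = host_cutsq[i][j];
    host_write[idx].y = host_cg_type[i][j];
    host_write[idx].z = host_lj1[i][j];
    host_write[idx].w = host_lj2[i][j];
  }
// host_write is then copied into the device vector lj1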
@ -126,7 +126,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -138,7 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
@ -24,15 +24,15 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -50,20 +50,20 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -76,12 +76,12 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv);
numtyp inv1,inv2;
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
@ -93,7 +93,7 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
@ -116,9 +116,9 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
} // if ii
}
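The lj1[mtype].y branch above picks the CMM exponent pair; for cg_type == 2 the kernel builds inv1 = r^-4 and inv2 = r^-8, which matches the 12-4 form if lj1.z and lj1.w hold the prefactors 12*A and 4*B. A standalone sketch of that branch under that assumption:

// returns F/r for U(r) = A*r^-12 - B*r^-4, with lj1z = 12*A, lj1w = 4*B
static double force_over_r_12_4(double rsq, double lj1z, double lj1w) {
  const double r2inv = 1.0/rsq;          // r^-2
  const double inv1  = r2inv*r2inv;      // r^-4
  const double inv2  = inv1*inv1;        // r^-8
  return r2inv*inv1*(lj1z*inv2 - lj1w);  // 12*A*r^-14 - 4*B*r^-6
}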
__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
@ -139,30 +139,30 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -175,11 +175,11 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=ucl_recip(r2inv);
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
@ -191,7 +191,7 @@ __kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class CGCMM : public BaseAtomic<numtyp, acctyp> {
public:
CGCMM();
~CGCMM();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -40,7 +40,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
@ -66,7 +66,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -28,9 +28,9 @@ static CGCMM<PRECISION,ACC_PRECISION> CMMMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
@ -55,7 +55,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
@ -78,7 +78,7 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -103,8 +103,8 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full,
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -37,22 +37,22 @@ template <class numtyp, class acctyp>
CGCMMLongT::~CGCMMLong() {
clear();
}
template <class numtyp, class acctyp>
int CGCMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CGCMMLongT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
@ -137,7 +137,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -149,13 +149,13 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
@ -29,12 +29,12 @@ texture<int2> q_tex;
#define q_tex q_
#endif
__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
@ -70,7 +70,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@ -136,7 +136,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -154,17 +154,17 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
} // if ii
}
__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
@ -179,7 +179,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@ -187,16 +187,16 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
@ -262,7 +262,7 @@ __kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -30,7 +30,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -40,8 +40,8 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
@ -58,7 +58,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
@ -68,7 +68,7 @@ class CGCMMLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -28,9 +28,9 @@ static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
@ -58,7 +58,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
@ -82,7 +82,7 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -99,7 +99,7 @@ void cmml_gpu_clear() {
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -109,8 +109,8 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CHARMMLongT::~CHARMMLong() {
clear();
}
template <class numtyp, class acctyp>
int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@ -45,9 +45,9 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
@ -144,7 +144,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -153,17 +153,17 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
@ -31,14 +31,14 @@ texture<int2> q_tex;
__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const int lj_types,
const __global numtyp *restrict sp_lj,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
@ -61,7 +61,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@ -93,7 +93,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/
@ -130,7 +130,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -148,19 +148,19 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
} // if ii
}
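Between cut_lj_innersq and cut_ljsq the LJ contribution above is tapered by the CHARMM switching function: switch1 is S(r^2) and switch2 carries the dS term folded into the force. The same algebra on the host, assuming denom_lj = (cut_ljsq - cut_lj_innersq)^3 as set up by the pair style:

static void charmm_switch(double rsq, double cut_ljsq, double cut_lj_innersq,
                          double denom_lj, double &s, double &ds_term) {
  const double d = cut_ljsq - rsq;
  // S = (c_out - r^2)^2 * (c_out + 2 r^2 - 3 c_in) / denom
  s = d*d*(cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq)/denom_lj;
  // force term, exactly as in the kernel: 12 r^2 (c_out - r^2)(r^2 - c_in)/denom
  ds_term = 12.0*rsq*d*(rsq - cut_lj_innersq)/denom_lj;
}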
__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid, ii, offset;
@ -174,7 +174,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@ -182,16 +182,16 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@ -229,7 +229,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)/
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)/

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -30,7 +30,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -40,12 +40,12 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
@ -70,7 +70,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e, _g_ewald, _denom_lj;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : brownw@ornl.gov
***************************************************************************/
@ -87,7 +87,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
@ -104,7 +104,7 @@ void crml_gpu_clear() {
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
@ -114,14 +114,14 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -33,23 +33,23 @@ ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
ColloidT::~Colloid() {
clear();
}
template <class numtyp, class acctyp>
int ColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int ColloidT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, double **host_a12,
double **host_a1, double **host_a2,
double **host_d1, double **host_d2,
double **host_sigma3, double **host_sigma6,
int **host_form, const int nlocal,
const int nall, const int max_nbors,
@ -97,7 +97,7 @@ int ColloidT::init(const int ntypes,
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++)
for (int j=0; j<ntypes; j++) {
@ -153,7 +153,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@ -170,9 +170,9 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
@ -24,18 +24,18 @@ texture<int4,1> pos_tex;
#define pos_tex x_
#endif
__kernel void k_colloid(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
const int lj_types,
const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1,
const __global numtyp4 *restrict colloid2,
const __global int *form,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
@ -53,20 +53,20 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -79,21 +79,21 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
@ -102,15 +102,15 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
@ -132,16 +132,16 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
}
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
@ -151,14 +151,14 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -176,22 +176,22 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
} // if ii
}
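form[] selects one of the three pair shapes named in the comments above; a trivial sketch of the encoding (enumerator names assumed from the host-side pair style, not shown in this diff):

enum ColloidForm {
  SMALL_SMALL = 0,  // two point particles: plain 12-6 LJ
  SMALL_LARGE = 1,  // point particle vs finite-radius sphere
  LARGE_LARGE = 2   // two finite-radius spheres (Hamaker form)
};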
__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1_in,
const __global numtyp4 *restrict lj3_in,
const __global numtyp *restrict sp_lj_in,
const __global numtyp4 *restrict colloid1_in,
const __global numtyp4 *restrict colloid2_in,
const __global int *form_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@ -208,7 +208,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -217,7 +217,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
@ -231,7 +231,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
@ -244,20 +244,20 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
@ -266,15 +266,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) *
(K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
@ -296,16 +296,16 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
} else force = (numtyp)0.0;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
@ -315,15 +315,15 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 *
((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -24,13 +24,13 @@ template <class numtyp, class acctyp>
class Colloid : public BaseAtomic<numtyp, acctyp> {
public:
Colloid();
~Colloid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
@ -40,11 +40,11 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
@ -65,7 +65,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
UCL_D_Vec<numtyp4> lj3;
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
UCL_D_Vec<numtyp4> colloid1;
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
/// colloid2.w = sigma6
UCL_D_Vec<numtyp4> colloid2;
/// form
@ -76,7 +76,7 @@ class Colloid : public BaseAtomic<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:

View File

@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
@ -29,9 +29,9 @@ static Colloid<PRECISION,ACC_PRECISION> COLLMF;
// ---------------------------------------------------------------------------
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
@ -57,9 +57,9 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
int init_ok=0;
if (world_me==0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, host_a12, host_a1,
host_a2, host_d1, host_d2, host_sigma3,
host_sigma6, host_form, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
@ -78,13 +78,13 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
}
if (gpu_rank==i && world_me!=0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
-                          offset, special_lj, host_a12, host_a1, host_a2,
-                          host_d1, host_d2, host_sigma3, host_sigma6, host_form,
+                          offset, special_lj, host_a12, host_a1, host_a2,
+                          host_d1, host_d2, host_sigma3, host_sigma6, host_form,
inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier();
-    if (message)
+    if (message)
fprintf(screen,"Done.\n");
}
if (message)
@@ -109,8 +109,8 @@ int ** colloid_gpu_compute_n(const int ago, const int inum_full,
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
-}
+}
void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

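The two init hunks above follow the library's replicated-initialization pattern: world rank 0 builds the device data first, then the remaining processes attached to each GPU repeat the call, one rank at a time, between barriers. A self-contained toy sketch of that control flow (all types and names below are hypothetical stand-ins, not library API):

// Toy stand-ins for the device/force-object pair used above.
struct ToyDevice {
  void world_barrier() { /* MPI_Barrier over all ranks (stubbed) */ }
  void gpu_barrier()   { /* MPI_Barrier over ranks sharing this GPU (stubbed) */ }
};
struct ToyForce {
  ToyDevice dev;
  int init() { return 0; /* allocate per-process device buffers (stubbed) */ }
};

int replicated_init(ToyForce &mf, int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok = 0;
  if (world_me == 0)                     // first pass: world rank 0 only
    init_ok = mf.init();
  mf.dev.world_barrier();                // everyone waits for rank 0
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)  // remaining ranks on this GPU, in turn
      init_ok = mf.init();
    mf.dev.gpu_barrier();                // serialize ranks sharing one card
  }
  return init_ok;
}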
View File

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
- begin :
+ begin :
email : ndtrung@umich.edu
***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CoulT::~Coul() {
clear();
}
template <class numtyp, class acctyp>
int CoulT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@@ -75,7 +75,7 @@ int CoulT::init(const int ntypes, double **host_scale, double **host_cutsq,
scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
@@ -97,10 +97,10 @@ void CoulT::reinit(const int ntypes, double **host_scale) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
}
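The reinit() hunk above zero-fills a host staging buffer and lets type_pack1() flatten the per-type-pair scale table before it is copied to the read-only device vector. A stand-alone approximation of that packing (indexing conventions approximated -- LAMMPS coefficient tables are 1-indexed and lj_types is typically padded to at least ntypes+1; the real type_pack1() may differ in detail):

#include <vector>

// Flatten a 1-indexed ntypes x ntypes host table into a row-major
// lj_types x lj_types staging array; unused entries stay zero-padded.
// Requires lj_types >= ntypes + 1.
std::vector<double> pack1(int ntypes, int lj_types, double **host_table) {
  std::vector<double> flat(lj_types * lj_types, 0.0);
  for (int i = 1; i <= ntypes; i++)
    for (int j = 1; j <= ntypes; j++)
      flat[i * lj_types + j] = host_table[i][j];
  return flat;  // ready to copy into a UCL_D_Vec-style device buffer
}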
@@ -138,7 +138,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@@ -149,14 +149,14 @@ void CoulT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}

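The loop() hunks above size the kernel launch as GX = ceil(inum / (BX / _threads_per_atom)): each atom is handled by _threads_per_atom cooperating threads, so a block of BX threads covers BX/_threads_per_atom atoms. A small worked sketch of that arithmetic (the values below are illustrative only):

#include <cmath>
#include <cstdio>

int main() {
  const int inum = 1000;           // atoms assigned to this process (example)
  const int BX = 128;              // threads per block (example)
  const int threads_per_atom = 4;  // cooperating threads per atom (example)
  const int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                       (BX / threads_per_atom)));
  // 128/4 = 32 atoms per block; ceil(1000/32) = 32 blocks.
  printf("GX = %d\n", GX);         // prints GX = 32
  return 0;
}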
View File

@@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
-// begin :
+// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
@@ -33,14 +33,14 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
const int lj_types,
const __global numtyp *restrict sp_cl_in,
-                     const __global int *dev_nbor,
-                     const __global int *dev_packed,
+                     const __global int *dev_nbor,
+                     const __global int *dev_packed,
__global acctyp4 *restrict ans,
-                     __global acctyp *restrict engv,
+                     __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
-                     const int nbor_pitch,
-                     const __global numtyp *restrict q_,
-                     const __global numtyp *restrict cutsq,
+                     const int nbor_pitch,
+                     const __global numtyp *restrict q_,
+                     const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@@ -50,7 +50,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
sp_cl[1]=sp_cl_in[1];
sp_cl[2]=sp_cl_in[2];
sp_cl[3]=sp_cl_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@@ -58,13 +58,13 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
@@ -120,14 +120,14 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
__kernel void k_coul_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
const __global numtyp *restrict sp_cl_in,
-                          const __global int *dev_nbor,
+                          const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
-                          __global acctyp *restrict engv,
-                          const int eflag, const int vflag, const int inum,
-                          const int nbor_pitch,
+                          __global acctyp *restrict engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch,
const __global numtyp *restrict q_,
-                          const __global numtyp *restrict _cutsq,
+                          const __global numtyp *restrict _cutsq,
const numtyp qqrd2e, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
@@ -139,7 +139,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[tid]=_cutsq[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@@ -147,15 +147,15 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;

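In the kernels above, nbor_info() hands each of the t_per_atom threads working on atom ii its own starting neighbor and a stride; the for ( ; nbor<nbor_end; nbor+=n_stride) loops then cover the list without overlap. A toy host-side sketch of that partitioning (in the dense GPU layout the stride also folds in the neighbor-row pitch; plain t_per_atom is used here for clarity):

#include <cstdio>

int main() {
  const int numj = 10;       // neighbors of atom ii (example)
  const int t_per_atom = 4;  // threads cooperating on one atom (example)
  for (int offset = 0; offset < t_per_atom; offset++) {
    printf("thread offset %d handles neighbors:", offset);
    for (int n = offset; n < numj; n += t_per_atom)  // n_stride == t_per_atom here
      printf(" %d", n);
    printf("\n");
  }
  return 0;
}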
View File

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
- begin :
+ begin :
email : ndtrung@umich.edu
***************************************************************************/
@@ -30,7 +30,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
-    *
+    *
* Returns:
    *   - 0 if successful
* - -1 if fix gpu not found
@@ -39,13 +39,13 @@ class Coul : public BaseCharge<numtyp, acctyp> {
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_scale,
double **host_cutsq, double *host_special_coul,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double qqrd2e);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_scale);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@@ -68,7 +68,7 @@ class Coul : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
-  /// Number of atom types
+  /// Number of atom types
int _lj_types;
numtyp _qqrd2e;

View File

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
- begin :
+ begin :
email : ndtrung@umich.edu
***************************************************************************/
@@ -37,7 +37,7 @@ template <class numtyp, class acctyp>
CoulDebyeT::~CoulDebye() {
clear();
}
template <class numtyp, class acctyp>
int CoulDebyeT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
@@ -87,7 +87,7 @@ int CoulDebyeT::init(const int ntypes, double **host_scale,
_qqrd2e=qqrd2e;
_kappa=kappa;
_allocated=true;
this->_max_bytes=cutsq.row_bytes()+scale.row_bytes()+sp_cl.row_bytes();
return 0;
@@ -98,10 +98,10 @@ void CoulDebyeT::reinit(const int ntypes, double **host_scale) {
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<_lj_types*_lj_types; i++)
host_write[i]=0.0;
this->atom->type_pack1(ntypes,_lj_types,scale,host_write,host_scale);
}
@@ -139,7 +139,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@@ -156,9 +156,9 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) {
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
-                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
+                     &ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom);
}
this->time_pair.stop();

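The new _kappa member stored above is the inverse Debye screening length. For reference, a stand-alone sketch of the screened-Coulomb energy and radial force this pair style evaluates, E(r) = qqrd2e*qi*qj*exp(-kappa*r)/r -- illustrative host-side code with example values, not the kernel's literal code path:

#include <cmath>
#include <cstdio>

int main() {
  const double qqrd2e = 332.06371;  // Coulomb conversion, LAMMPS "real" units
  const double qi = 1.0, qj = -1.0; // example charges (e)
  const double kappa = 0.5;         // inverse screening length (example)
  const double r = 3.0;             // pair separation (example)

  const double screening = exp(-kappa * r);
  const double energy = qqrd2e * qi * qj * screening / r;
  // F = -dE/dr: the screened force picks up the extra (1 + kappa*r) factor.
  const double force  = qqrd2e * qi * qj * screening * (1.0 + kappa * r) / (r * r);
  printf("E = %g, F = %g\n", energy, force);
  return 0;
}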
View File

@@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
-// begin :
+// begin :
// email : ndtrung@umich.edu
// ***************************************************************************/
@@ -31,16 +31,16 @@ texture<int2> q_tex;
__kernel void k_coul_debye(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
-                           const int lj_types,
+                           const int lj_types,
const __global numtyp *restrict sp_cl_in,
-                           const __global int *dev_nbor,
-                           const __global int *dev_packed,
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch,
const __global numtyp *restrict q_ ,
-                           const __global numtyp *restrict cutsq,
+                           const __global numtyp *restrict cutsq,
const numtyp qqrd2e, const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
@@ -59,27 +59,27 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
numtyp factor_coul;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
factor_coul = sp_cl[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
@@ -146,7 +146,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
scale[tid]=scale_in[tid];
cutsq[tid]=_cutsq[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
@@ -154,15 +154,15 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
int i, numj, nbor, nbor_end;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;

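The loop body above decodes factor_coul = sp_cl[sbmask(j)] and then strips the flag bits with j &= NEIGHMASK: LAMMPS packs the special-bond class of each neighbor into the top two bits of the packed index. A small sketch, assuming the stock SBBITS/NEIGHMASK definitions:

#include <cstdio>

const int SBBITS    = 30;          // stock LAMMPS definition (assumed here)
const int NEIGHMASK = 0x3FFFFFFF;  // low 30 bits hold the real neighbor index

inline int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  double sp_cl[4] = {1.0, 0.0, 0.0, 0.5};  // example special-coulomb factors
  int j = (1 << SBBITS) | 42;              // neighbor 42, special class 1 (1-2 bond)
  double factor_coul = sp_cl[sbmask(j)];   // -> 0.0: 1-2 neighbors fully excluded
  j &= NEIGHMASK;                          // -> 42, the real neighbor index
  printf("neighbor %d, factor %g\n", j, factor_coul);
  return 0;
}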
View File

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
- begin :
+ begin :
email : ndtrung@umich.edu
***************************************************************************/
@@ -30,7 +30,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
-    *
+    *
* Returns:
    *   - 0 if successful
* - -1 if fix gpu not found
@@ -39,14 +39,14 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_scale,
double **host_cutsq, double *host_special_coul,
-           const int nlocal, const int nall, const int max_nbors,
+           const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const double qqrd2e, const double kappa);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **host_scale);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@@ -69,7 +69,7 @@ class CoulDebye : public BaseCharge<numtyp, acctyp> {
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
-  /// Number of atom types
+  /// Number of atom types
int _lj_types;
numtyp _qqrd2e,_kappa;

View File

@@ -9,7 +9,7 @@
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
- begin :
+ begin :
email : ndtrung@umich.edu
***************************************************************************/
@@ -75,7 +75,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq,
maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa);
CDEMF.device->gpu_barrier();
-    if (message)
+    if (message)
fprintf(screen,"Done.\n");
}
if (message)
@@ -93,16 +93,16 @@ void cdebye_gpu_reinit(const int ntypes, double **host_scale) {
int world_me=CDEMF.device->world_me();
int gpu_rank=CDEMF.device->gpu_rank();
int procs_per_gpu=CDEMF.device->procs_per_gpu();
if (world_me==0)
CDEMF.reinit(ntypes, host_scale);
CDEMF.device->world_barrier();
for (int i=0; i<procs_per_gpu; i++) {
if (gpu_rank==i && world_me!=0)
CDEMF.reinit(ntypes, host_scale);
CDEMF.device->gpu_barrier();
}
}
@@ -123,8 +123,8 @@ int** cdebye_gpu_compute_n(const int ago, const int inum_full,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
-}
+}
void cdebye_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,

View File

@@ -37,18 +37,18 @@ template <class numtyp, class acctyp>
CoulDSFT::~CoulDSF() {
clear();
}
template <class numtyp, class acctyp>
int CoulDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
-int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
-                   const int max_nbors, const int maxspecial,
+int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
+                   const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
-                   const double qqrd2e, const double e_shift, const double f_shift,
+                   const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
@@ -123,7 +123,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
@@ -134,15 +134,15 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->ans->force, &this->ans->engv, &eflag,
+                          &this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
-                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                     &this->ans->force, &this->ans->engv,
+    this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);

Some files were not shown because too many files have changed in this diff.