git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8694 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-08-21 13:59:15 +00:00
parent 31551d81fd
commit 647ea4c29f
49 changed files with 8851 additions and 0 deletions

224
lib/gpu/geryon/ucl_matrix.h Normal file

@@ -0,0 +1,224 @@
/***************************************************************************
ucl_matrix.h
-------------------
W. Michael Brown
Matrix Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu May 10 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Matrix S-Object
template <class hosttype, class devtype>
class UCL_Matrix {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Mat<hosttype> host;
/// Device Allocation
UCL_D_Mat<devtype> device;
UCL_Matrix() { }
~UCL_Matrix() { }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \note When passing a command queue instead of a device, the device
* allocation is always performed, even if the device shares memory
* with the host.
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param acc Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain rows x cols elements
inline int resize(const int rows, const int cols) {
assert(host.kind()!=UCL_VIEW);
int err=host.resize(rows,cols);
if (err!=UCL_SUCCESS)
return err;
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,rows,cols);
}
/// Resize (only if bigger) the allocation to contain new_rows x new_cols elements
inline int resize_ib(const int new_rows, const int new_cols)
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
/// Get the number of rows
inline size_t rows() const { return host.rows(); }
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes()*device.rows(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access
inline hosttype & operator()(const int row, const int col)
{ return host(row,col); }
/// 2D access
inline const hosttype & operator()(const int row, const int col) const
{ return host(row,col); }
/// Return a pointer to the host memory pointer for the allocation
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
/// Get the size of a row on the host (including any padding) in elements
inline size_t row_size() const { return host.row_size(); }
/// Get the size of a row on the host (including any padding) in bytes
inline size_t row_bytes() const { return host.row_bytes(); }
/// Get the size on the host in bytes of 1 element
inline int element_size() const { return sizeof(hosttype); }
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
inline void update_host(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,async); }
/// Update the allocation on the host (using command queue)
inline void update_host(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,cq); }
/// Update the first n elements on the host (true for asynchronous copy)
inline void update_host(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,async); }
/// Update the first n elements on the host (using command queue)
inline void update_host(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,cq); }
/// Update slice on the host (true for asynchronous copy)
inline void update_host(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,async); }
/// Update slice on the host (using command queue)
inline void update_host(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,cq); }
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
inline void update_device(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,async); }
/// Update the allocation on the device (using command queue)
inline void update_device(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,cq); }
/// Update the first n elements on the device (true for asynchronous copy)
inline void update_device(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,async); }
/// Update the first n elements on the device (using command queue)
inline void update_device(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,cq); }
/// Update slice on the device (true for asynchronous copy)
inline void update_device(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,async); }
/// Update slice on the device (using command queue)
inline void update_device(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,cq); }
private:
UCL_H_Mat<devtype> _buffer;
};
#endif
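For orientation, a minimal host-side usage sketch of the container above (hypothetical driver code: the header names and the default-constructed UCL_Device are assumptions about the surrounding Geryon setup, not part of this file):

#include "geryon/nvd_device.h"   // assumed: any backend header that defines
#include "geryon/nvd_mat.h"      // UCL_Device and enables _UCL_MAT_ALLOW

void example() {
  UCL_Device dev;                        // assumed default-constructible
  UCL_Matrix<double,float> m(4,8,dev);   // pinned host copy + device copy
  for (int i=0; i<(int)m.numel(); i++)
    m[i]=1.0;                            // stage data through the host view
  m.update_device();                     // async host->device (casts to float)
  // ... run kernels against m.device ...
  m.update_host();                       // async device->host
  m.sync();                              // block on the default command queue
}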

272
lib/gpu/geryon/ucl_s_obj_help.h Normal file

@@ -0,0 +1,272 @@
/***************************************************************************
ucl_s_obj_help.h
-------------------
W. Michael Brown
Helper routines for allocating memory for s-objects and performing
host/device updates. (Different routines depending on whether the
same type is used on the host and device).
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon May 14 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
template <int st> struct _ucl_s_obj_help;
// Host and device containers are same type
// -- Don't need casting buffers
// -- Can potentially use same memory if shared by accelerator
template <> struct _ucl_s_obj_help<1> {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(host);
return UCL_SUCCESS;
} else
return device.alloc(cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(host);
return UCL_SUCCESS;
} else
return device.alloc(rows,cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
ucl_copy(dst,src,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
return device.resize(cols);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
return device.resize(rows,cols);
}
};
// Host and device containers are different types
template <int st> struct _ucl_s_obj_help {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(_buffer);
return UCL_SUCCESS;
} else
return device.alloc(cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(_buffer);
return UCL_SUCCESS;
} else
return device.alloc(rows,cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,buffer,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_cast_copy(dst,src,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_cast_copy(dst,src,cols,buffer,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,rows,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,rows,cols,buffer,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
int err=buff.resize(cols);
if (err!=UCL_SUCCESS)
return err;
if (device.kind()==UCL_VIEW) {
device.view(buff);
return UCL_SUCCESS;
} else
return device.resize(cols);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
int err=buff.resize(rows,cols);
if (err!=UCL_SUCCESS)
return err;
if (device.kind()==UCL_VIEW) {
device.view(buff);
return UCL_SUCCESS;
} else
return device.resize(rows,cols);
}
};
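The dispatch above relies on ucl_same_type<hosttype,devtype>::ans being 1 exactly when the host and device element types match. A self-contained sketch of such a trait (an assumption about its definition elsewhere in Geryon, shown only to make the specialization choice concrete):

// Sketch of the assumed trait:
template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };
template <class t1> struct ucl_same_type<t1,t1> { enum { ans=1 }; };

// With this, UCL_Matrix<double,double> instantiates _ucl_s_obj_help<1>
// (no cast buffer; plain ucl_copy, and a zero-copy view when the
// accelerator shares host memory), while UCL_Matrix<double,float>
// instantiates the primary template, which routes every transfer through
// _buffer via ucl_cast_copy.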

223
lib/gpu/geryon/ucl_vector.h Normal file

@@ -0,0 +1,223 @@
/***************************************************************************
ucl_vector.h
-------------------
W. Michael Brown
Vector Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu May 10 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row Vector S-Object
template <class hosttype, class devtype>
class UCL_Vector {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Vec<hosttype> host;
/// Device Allocation
UCL_D_Vec<devtype> device;
UCL_Vector() { }
~UCL_Vector() { }
/// Construct with n columns
/** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \note When passing a command queue instead of a device, the device
* allocation is always performed, even if the device shares memory
* with the host.
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,cq,kind1,kind2); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param acc Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements
inline int resize(const int cols) {
assert(host.kind()!=UCL_VIEW);
int err=host.resize(cols);
if (err!=UCL_SUCCESS)
return err;
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,cols);
}
/// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
/// Get the number of rows
inline size_t rows() const { return host.rows(); }
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()+_buffer.row_bytes(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access (row should always be 0)
inline hosttype & operator()(const int row, const int col)
{ return host[col]; }
/// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const
{ return host[col]; }
/// Return a pointer to the host memory pointer for the allocation
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Block until command_queue associated with vector is complete
inline void sync() { host.sync(); }
/// Get the size of a row on the host (including any padding) in elements
inline size_t row_size() const { return host.row_size(); }
/// Get the size of a row on the host (including any padding) in bytes
inline size_t row_bytes() const { return host.row_bytes(); }
/// Get the size on the host in bytes of 1 element
inline int element_size() const { return sizeof(hosttype); }
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
inline void update_host(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,async); }
/// Update the allocation on the host (using command queue)
inline void update_host(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,cq); }
/// Update the first n elements on the host (true for asynchronous copy)
inline void update_host(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,async); }
/// Update the first n elements on the host (using command queue)
inline void update_host(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,cq); }
/// Update slice on the host (true for asynchronous copy)
inline void update_host(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,async); }
/// Update slice on the host (using command queue)
inline void update_host(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,cq); }
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
inline void update_device(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,async); }
/// Update the allocation on the device (using command queue)
inline void update_device(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,cq); }
/// Update the first n elements on the device (true for asynchronous copy)
inline void update_device(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,async); }
/// Update the first n elements on the device (using command queue)
inline void update_device(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,cq); }
/// Update slice on the device (true for asynchronous copy)
inline void update_device(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,async); }
/// Update slice on the device (using command queue)
inline void update_device(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,cq); }
private:
UCL_H_Vec<devtype> _buffer;
};
#endif
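As with the matrix container, the count and slice overloads let callers move only what changed. A small sketch (hypothetical helper, same assumptions as the matrix example above):

void scatter_first_n(UCL_Vector<double,float> &v, const int n) {
  for (int i=0; i<n; i++)
    v[i]=0.5*i;            // stage only the first n elements on the host
  v.update_device(n,true); // copy just those n elements, asynchronously
  v.sync();                // block until the transfer completes
}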

315
lib/gpu/lal_base_dipole.cpp Normal file

@@ -0,0 +1,315 @@
/***************************************************************************
base_dipole.cpp
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
dipole, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include "lal_base_dipole.h"
using namespace LAMMPS_AL;
#define BaseDipoleT BaseDipole<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
}
template <class numtyp, class acctyp>
BaseDipoleT::~BaseDipole() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseDipoleT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
mu_tex.bind_float(atom->quat,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return success;
}
template <class numtyp, class acctyp>
void BaseDipoleT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseDipoleT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseDipoleT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q, double **host_mu,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
atom->add_quat_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double **host_mu,
double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseDipoleT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
mu_tex.get_texture(*pair_program,"mu_tex");
_compiled=true;
}
template class BaseDipole<PRECISION,ACC_PRECISION>;
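For context, the intended call order from a host pair style is init_atomic() once at setup, one compute() per timestep, and clear_atomic() at teardown; schematically (hypothetical caller, argument names are placeholders):

// base.init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
//                  screen,pair_program,"k_mystyle");        // once, at setup
// base.compute(ago,inum,nall,x,type,ilist,numj,firstneigh,  // each timestep,
//              eflag,vflag,eatom,vatom,host_start,          // host neighboring
//              cpu_time,success,q,mu,nlocal,boxlo,prd);
// (use the int** overload instead when neighboring on the device)
// base.clear_atomic();                                      // at teardown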

200
lib/gpu/lal_base_dipole.h Normal file

@@ -0,0 +1,200 @@
/***************************************************************************
base_dipole.h
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
dipole, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_DIPOLE_H
#define LAL_BASE_DIPOLE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseDipole {
public:
BaseDipole();
virtual ~BaseDipole();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
mu_tex.bind_float(atom->quat,4);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
double **mu, const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double **mu, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
UCL_Texture mu_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif

162
lib/gpu/lal_born.cpp Normal file

@@ -0,0 +1,162 @@
/***************************************************************************
born.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_cl.h"
#elif defined(USE_CUDART)
const char *born=0;
#else
#include "born_cubin.h"
#endif
#include "lal_born.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornT Born<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BornT::~Born() {
clear();
}
template <class numtyp, class acctyp>
int BornT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born,"k_born");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_sigma);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Born<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2,
&cutsq_sigma, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
&cutsq_sigma, &_lj_types, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Born<PRECISION,ACC_PRECISION>;
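A worked example of the grid-size arithmetic in loop(), with illustrative numbers only: for BX=128 threads per block, _threads_per_atom=4, and inum=10000 atoms, each block covers BX/t_per_atom = 32 atoms, so GX = ceil(10000/32) = 313 blocks.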

201
lib/gpu/lal_born.cu Normal file

@@ -0,0 +1,201 @@
// **************************************************************************
// born.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
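// For reference, the energy evaluated below is the Born-Mayer-Huggins form
//   E(r) = A*exp((sigma-r)/rho) - C/r^6 + D/r^8   for r < r_c,
// with coeff1 = {1/rho, born1, born2, born3} and coeff2 = {a, c, d, offset}
// as packed by the host class. Assuming the usual pair_born precomputation
// (born1 = A/rho, born2 = 6*C, born3 = 8*D), the scalar computed into
// 'force' is (-dE/dr)/r, so multiplying by delx/dely/delz yields the
// Cartesian force components.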
__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2,
__global numtyp2 *cutsq_sigma,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp2 *cutsq_sigma,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

84
lib/gpu/lal_born.h Normal file

@@ -0,0 +1,84 @@
/***************************************************************************
born.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_H
#define LAL_BORN_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Born : public BaseAtomic<numtyp, acctyp> {
public:
Born();
~Born();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma
UCL_D_Vec<numtyp2> cutsq_sigma;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

175
lib/gpu/lal_born_coul_long.cpp Normal file

@@ -0,0 +1,175 @@
/***************************************************************************
born_coul_long.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *born_coul_long=0;
#else
#include "born_coul_long_cubin.h"
#endif
#include "lal_born_coul_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornCoulLongT BornCoulLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born_coul_long,"k_born_long");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornCoulLongT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornCoulLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(BornCoulLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class BornCoulLong<PRECISION,ACC_PRECISION>;
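
A note on the layout these init() routines share: per-type-pair constants are packed into flattened lj_types x lj_types vec4 tables, so the kernels index a pair (itype,jtype) as mtype = itype*lj_types + jtype (or itype*MAX_SHARED_TYPES + jtype on the fast path). A minimal host-side sketch of that packing, with a hypothetical stand-in for the UCL containers and 0-based type indices for simplicity:

#include <vector>

// Sketch only: mirrors the flattened indexing behind type_pack4 above;
// quad and pack4_sketch are illustrative stand-ins, not library code.
struct quad { double x, y, z, w; };

std::vector<quad> pack4_sketch(int ntypes, int lj_types,
                               double **a, double **b, double **c, double **d) {
  std::vector<quad> out(lj_types * lj_types, {0.0, 0.0, 0.0, 0.0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      quad &p = out[i * lj_types + j];   // the same mtype the kernels compute
      p.x = a[i][j]; p.y = b[i][j]; p.z = c[i][j]; p.w = d[i][j];
    }
  return out;
}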

@ -0,0 +1,262 @@
// **************************************************************************
// born_coul_long.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born/coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
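
Both kernels above approximate the real-space complementary error function with the classic Abramowitz & Stegun 7.1.26 rational polynomial; EWALD_P and A1..A5 are defined elsewhere in the library (the values below are the standard A&S constants, assumed here rather than copied from this diff), and EWALD_F is 2/sqrt(pi) for the derivative term. A standalone check against std::erfc:

#include <cmath>
#include <cstdio>

// Standard A&S 7.1.26 constants; assumed to match the library's definitions.
const double EWALD_P = 0.3275911;
const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
             A4 = -1.453152027, A5 =  1.061405429;

double erfc_approx(double x) {
  double t = 1.0 / (1.0 + EWALD_P * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}

int main() {
  for (double x = 0.1; x < 3.0; x += 0.5)   // absolute error stays below ~1.5e-7
    std::printf("x=%.1f  approx=%.8f  exact=%.8f\n",
                x, erfc_approx(x), std::erfc(x));
  return 0;
}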

@ -0,0 +1,88 @@
/***************************************************************************
born_coul_long.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_COUL_LONG_H
#define LAL_BORN_COUL_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BornCoulLong : public BaseCharge<numtyp, acctyp> {
public:
BornCoulLong();
~BornCoulLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
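
Given the packing documented above (coeff1 = rhoinv, born1, born2, born3; coeff2 = a, c, d, offset; cutsq_sigma = cutsq, cut_ljsq, sigma), a host-side reference of the Born terms the kernels evaluate can be sketched as follows; illustrative only, not library code:

#include <cmath>

struct Coeff1 { double rhoinv, born1, born2, born3; };  // coeff1.x .. coeff1.w
struct Coeff2 { double a, c, d, offset; };              // coeff2.x .. coeff2.w

// Pair energy: a*exp((sigma-r)*rhoinv) - c/r^6 + d/r^8 - offset
double born_energy(double rsq, double sigma, const Coeff1 &c1, const Coeff2 &c2) {
  double r = std::sqrt(rsq);
  double r2inv = 1.0 / rsq, r6inv = r2inv * r2inv * r2inv;
  double rexp = std::exp((sigma - r) * c1.rhoinv);
  return c2.a * rexp - c2.c * r6inv + c2.d * r2inv * r6inv - c2.offset;
}

// Returns r*(-dU/dr); the kernels multiply this by r2inv and then by
// delx/dely/delz to get the force components.
double born_force_r(double rsq, double sigma, const Coeff1 &c1) {
  double r = std::sqrt(rsq);
  double r2inv = 1.0 / rsq, r6inv = r2inv * r2inv * r2inv;
  double rexp = std::exp((sigma - r) * c1.rhoinv);
  return c1.born1 * r * rexp - c1.born2 * r6inv + c1.born3 * r2inv * r6inv;
}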

@ -0,0 +1,132 @@
/***************************************************************************
born_coul_long_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born/coul/long acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born_coul_long.h"
using namespace std;
using namespace LAMMPS_AL;
static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
BORNCLMF.clear();
gpu_mode=BORNCLMF.device->gpu_mode();
double gpu_split=BORNCLMF.device->particle_split();
int first_gpu=BORNCLMF.device->first_device();
int last_gpu=BORNCLMF.device->last_device();
int world_me=BORNCLMF.device->world_me();
int gpu_rank=BORNCLMF.device->gpu_rank();
int procs_per_gpu=BORNCLMF.device->procs_per_gpu();
BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu);
bool message=false;
if (BORNCLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNCLMF.estimate_gpu_overhead();
return init_ok;
}
void borncl_gpu_clear() {
BORNCLMF.clear();
}
int** borncl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double borncl_gpu_bytes() {
return BORNCLMF.host_memory_usage();
}
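
All of the _ext wrappers stage initialization the same way: world rank 0 initializes (and compiles the kernels) first, everyone synchronizes, and then the ranks sharing each GPU take turns. A schematic of that control flow, with hypothetical function-pointer stand-ins for the library calls:

// Control-flow sketch of the *_gpu_init staging; init_fn, world_barrier,
// and gpu_barrier are hypothetical stand-ins for the calls above.
int staged_init(int world_me, int gpu_rank, int procs_per_gpu,
                int (*init_fn)(), void (*world_barrier)(),
                void (*gpu_barrier)()) {
  int init_ok = 0;
  if (world_me == 0)            // rank 0 compiles/initializes first
    init_ok = init_fn();
  world_barrier();              // everyone waits for the compile
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)   // stagger ranks sharing a GPU
      init_ok = init_fn();
    gpu_barrier();
  }
  return init_ok;               // 0 on success, negative error code otherwise
}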

@ -0,0 +1,176 @@
/***************************************************************************
born_coul_wolf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/wolf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_coul_wolf_cl.h"
#elif defined(USE_CUDART)
const char *born_coul_wolf=0;
#else
#include "born_coul_wolf_cubin.h"
#endif
#include "lal_born_coul_wolf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornCoulWolfT BornCoulWolf<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double alf, const double e_shift, const double f_shift) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born_coul_wolf,"k_born_wolf");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_alf=alf;
_e_shift=e_shift;
_f_shift=f_shift;
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornCoulWolfT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornCoulWolfT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(BornCoulWolf<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class BornCoulWolf<PRECISION,ACC_PRECISION>;

@ -0,0 +1,282 @@
// **************************************************************************
// born_coul_wolf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born/coul/wolf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
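
Unlike the Ewald kernels, both Wolf kernels add a per-atom self term before the neighbor loop, split across the t_per_atom threads that share atom i and accumulated as 2*e_self, presumably so it survives the halving later applied to energies tallied over full neighbor lists. A host-side sketch of the bare term:

// Sketch of the per-atom Wolf self energy added above; q is the atom's
// charge, alpha the damping parameter (alf), qqrd2e the conversion constant.
double wolf_self_energy(double q, double alpha, double e_shift, double qqrd2e) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi), as defined above
  return -(0.5 * e_shift + alpha / MY_PIS) * q * q * qqrd2e;
}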

@ -0,0 +1,89 @@
/***************************************************************************
born_coul_wolf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/wolf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_COUL_WOLF_H
#define LAL_BORN_COUL_WOLF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
public:
BornCoulWolf();
~BornCoulWolf();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double alf, const double e_shift,
const double f_shift);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,134 @@
/***************************************************************************
born_coul_wolf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born/coul/wolf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born_coul_wolf.h"
using namespace std;
using namespace LAMMPS_AL;
static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double alf, const double e_shift, const double f_shift) {
BORNCWMF.clear();
gpu_mode=BORNCWMF.device->gpu_mode();
double gpu_split=BORNCWMF.device->particle_split();
int first_gpu=BORNCWMF.device->first_device();
int last_gpu=BORNCWMF.device->last_device();
int world_me=BORNCWMF.device->world_me();
int gpu_rank=BORNCWMF.device->gpu_rank();
int procs_per_gpu=BORNCWMF.device->procs_per_gpu();
BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu);
bool message=false;
if (BORNCWMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNCWMF.estimate_gpu_overhead();
return init_ok;
}
void borncw_gpu_clear() {
BORNCWMF.clear();
}
int** borncw_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double borncw_gpu_bytes() {
return BORNCWMF.host_memory_usage();
}

lib/gpu/lal_born_ext.cpp Normal file
@ -0,0 +1,124 @@
/***************************************************************************
born_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born.h"
using namespace std;
using namespace LAMMPS_AL;
static Born<PRECISION,ACC_PRECISION> BORNMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **sigma,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
BORNMF.clear();
gpu_mode=BORNMF.device->gpu_mode();
double gpu_split=BORNMF.device->particle_split();
int first_gpu=BORNMF.device->first_device();
int last_gpu=BORNMF.device->last_device();
int world_me=BORNMF.device->world_me();
int gpu_rank=BORNMF.device->gpu_rank();
int procs_per_gpu=BORNMF.device->procs_per_gpu();
BORNMF.device->init_message(screen,"born",first_gpu,last_gpu);
bool message=false;
if (BORNMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNMF.estimate_gpu_overhead();
return init_ok;
}
void born_gpu_clear() {
BORNMF.clear();
}
int ** born_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void born_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double born_gpu_bytes() {
return BORNMF.host_memory_usage();
}
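
Each _ext wrapper exposes two compute paths: *_gpu_compute_n for device-built neighbor lists (it hands ilist/jnum back so the host can handle atoms from host_start on) and *_gpu_compute for host-built ilist/numj/firstneigh. A hypothetical dispatch sketch; the mode query and its encoding are assumptions, not library API:

// Illustrative only: the name and gpu_mode encoding below are assumptions.
bool neighbor_lists_on_device(int gpu_mode) {
  return gpu_mode == 1;  // assumption: 1 = build neighbor lists on the GPU
}

void compute_step(int gpu_mode /* , per-step arguments elided */) {
  if (neighbor_lists_on_device(gpu_mode)) {
    // born_gpu_compute_n(...): device builds the lists and returns
    // ilist/jnum; atoms from host_start onward remain for the host.
  } else {
    // born_gpu_compute(...): host-built ilist/numj/firstneigh are reused.
  }
}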

lib/gpu/lal_colloid.cpp Normal file
@ -0,0 +1,181 @@
/***************************************************************************
colloid.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "colloid_cl.h"
#elif defined(USE_CUDART)
const char *colloid=0;
#else
#include "colloid_cubin.h"
#endif
#include "lal_colloid.h"
#include <cassert>
using namespace LAMMPS_AL;
#define ColloidT Colloid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
ColloidT::~Colloid() {
clear();
}
template <class numtyp, class acctyp>
int ColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int ColloidT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, double **host_a12,
double **host_a1, double **host_a2,
double **host_d1, double **host_d2,
double **host_sigma3, double **host_sigma6,
int **host_form, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,colloid,"k_colloid");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
colloid1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,colloid1,host_write,host_a12,host_a1,
host_a2);
colloid2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,colloid2,host_write,host_d1,host_d2,
host_sigma3,host_sigma6);
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++)
for (int j=0; j<ntypes; j++) {
dview_form[i*lj_types+j]=host_form[i][j];
}
ucl_copy(form,dview_form,false);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()
+colloid1.row_bytes()+colloid2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void ColloidT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
colloid1.clear();
colloid2.clear();
form.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double ColloidT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Colloid<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ColloidT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Colloid<PRECISION,ACC_PRECISION>;
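
The colloid kernels dispatch on a per-pair form code: 0 = SMALL_SMALL (plain 12-6 LJ), 1 = SMALL_LARGE (point vs. sphere), 2 = LARGE_LARGE (sphere vs. sphere Hamaker integration), matching the branch comments in the device code. A sketch of that dispatch with only the trivial branch spelled out:

#include <cmath>

enum Form { SMALL_SMALL = 0, SMALL_LARGE = 1, LARGE_LARGE = 2 };

// Sketch of the per-pair F/r factor the kernels multiply into
// delx/dely/delz; illustrative only, not library code.
double colloid_force_over_r(Form form, double rsq, double lj1x, double lj1y,
                            double factor_lj) {
  if (form == SMALL_SMALL) {
    double r2inv = 1.0 / rsq;
    double r6inv = r2inv * r2inv * r2inv;
    return factor_lj * r2inv * r6inv * (lj1x * r6inv - lj1y);
  }
  // SMALL_LARGE and LARGE_LARGE follow the Hamaker expressions in the
  // kernels above and are omitted here.
  return 0.0;
}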

lib/gpu/lal_colloid.cu Normal file
@ -0,0 +1,329 @@
// **************************************************************************
// colloid.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the colloid pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in,
__global numtyp4* colloid1,
__global numtyp4* colloid2,
__global int *form,
__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
K[4] = rsq*rsq;
K[3] = K[1] - K[2];
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
K[3] = K[1]+r;
K[4] = K[1]-r;
K[5] = K[2]+r;
K[6] = K[2]-r;
K[7] = ucl_recip(K[3]*K[4]);
K[8] = ucl_recip(K[5]*K[6]);
g[0] = ucl_powr(K[3],(numtyp)-7.0);
g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
g[2] = ucl_powr(K[5],(numtyp)-7.0);
g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
}
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_colloid_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in,
__global numtyp4 *lj3_in,
__global numtyp *sp_lj_in,
__global numtyp4 *colloid1_in,
__global numtyp4 *colloid2_in,
__global int *form_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
colloid1[tid]=colloid1_in[tid];
colloid2[tid]=colloid2_in[tid];
form[tid]=form_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
K[4] = rsq*rsq;
K[3] = K[1] - K[2];
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
K[3] = K[1]+r;
K[4] = K[1]-r;
K[5] = K[2]+r;
K[6] = K[2]-r;
K[7] = ucl_recip(K[3]*K[4]);
K[8] = ucl_recip(K[5]*K[6]);
g[0] = ucl_powr(K[3],(numtyp)-7.0);
g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
g[2] = ucl_powr(K[5],(numtyp)-7.0);
g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
} else force = (numtyp)0.0;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

lib/gpu/lal_colloid.h Normal file
@ -0,0 +1,89 @@
/***************************************************************************
colloid.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_COLLOID_H
#define LAL_COLLOID_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Colloid : public BaseAtomic<numtyp, acctyp> {
public:
Colloid();
~Colloid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
UCL_D_Vec<numtyp4> colloid1;
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
/// colloid2.w = sigma6
UCL_D_Vec<numtyp4> colloid2;
/// form
UCL_D_Vec<int> form;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
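The negative return codes documented in init() above are shared by the LAMMPS_AL pair styles added in this commit. A minimal host-side sketch of mapping them to messages; the helper and its wording are illustrative, not part of the library:
#include <cstdio>
// Hypothetical helper: translate a LAMMPS_AL init() code to text.
// Codes follow the doc comment in lal_colloid.h above.
const char *lal_init_error(int code) {
  switch (code) {
    case 0:  return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory error";
    case -4: return "GPU library was not compiled for this style";
    case -5: return "double precision is not supported on card";
    default: return "unknown error";
  }
}
int main() {
  int init_ok = -3;   // e.g. a value returned by Colloid::init()
  std::printf("init: %s\n", lal_init_error(init_ok));
  return 0;
}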

127
lib/gpu/lal_colloid_ext.cpp Normal file
View File

@ -0,0 +1,127 @@
/***************************************************************************
colloid_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to colloid acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_colloid.h"
using namespace std;
using namespace LAMMPS_AL;
static Colloid<PRECISION,ACC_PRECISION> COLLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
COLLMF.clear();
gpu_mode=COLLMF.device->gpu_mode();
double gpu_split=COLLMF.device->particle_split();
int first_gpu=COLLMF.device->first_device();
int last_gpu=COLLMF.device->last_device();
int world_me=COLLMF.device->world_me();
int gpu_rank=COLLMF.device->gpu_rank();
int procs_per_gpu=COLLMF.device->procs_per_gpu();
COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu);
bool message=false;
if (COLLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, host_a12, host_a1,
host_a2, host_d1, host_d2, host_sigma3,
host_sigma6, host_form, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
COLLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, host_a12, host_a1, host_a2,
host_d1, host_d2, host_sigma3, host_sigma6, host_form,
inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
COLLMF.estimate_gpu_overhead();
return init_ok;
}
void colloid_gpu_clear() {
COLLMF.clear();
}
int ** colloid_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double colloid_gpu_bytes() {
return COLLMF.host_memory_usage();
}
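Together, the entry points above span the lifetime of the accelerated style. A sketch of the call order a host pair style would follow, with argument lists elided for brevity; whether compute_n or compute is used depends on who builds the neighbor lists:
// Hypothetical driver, assuming only the entry points declared above.
// int err = colloid_gpu_init(...);      // once, before the run
// for each timestep:
//   colloid_gpu_compute_n(...);         //   device builds neighbor lists
//   ... or ...
//   colloid_gpu_compute(...);           //   host-built lists passed in
// double mb = colloid_gpu_bytes();      // optional memory report
// colloid_gpu_clear();                  // once, after the run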

153
lib/gpu/lal_coul_dsf.cpp Normal file
View File

@ -0,0 +1,153 @@
/***************************************************************************
coul_dsf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "coul_dsf_cl.h"
#elif defined(USE_CUDART)
const char *coul_dsf=0;
#else
#include "coul_dsf_cubin.h"
#endif
#include "lal_coul_dsf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CoulDSFT CoulDSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
CoulDSFT::CoulDSF() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CoulDSFT::~CoulDSF() {
clear();
}
template <class numtyp, class acctyp>
int CoulDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,coul_dsf,"k_coul_dsf");
if (success!=0)
return success;
_cut_coulsq=host_cut_coulsq;
_e_shift=e_shift;
_f_shift=f_shift;
_alpha=alpha;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,4,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CoulDSFT::clear() {
if (!_allocated)
return;
_allocated=false;
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CoulDSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CoulDSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CoulDSF<PRECISION,ACC_PRECISION>;
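In loop() above, each block of BX threads covers BX/t_per_atom atoms, so the grid size GX is the atom count divided by that quotient, rounded up. A standalone check of the arithmetic with illustrative values:
#include <cmath>
#include <cstdio>
int main() {
  int inum = 10000;     // atoms assigned to this device (illustrative)
  int BX = 128;         // block size, as block_size() might return
  int t_per_atom = 4;   // threads cooperating on each atom
  // Same expression as in CoulDSFT::loop():
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / t_per_atom)));
  std::printf("GX = %d\n", GX);  // 10000/32 rounded up = 313
  return 0;
}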

214
lib/gpu/lal_coul_dsf.cu Normal file
View File

@ -0,0 +1,214 @@
// **************************************************************************
// coul_dsf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the coul/dsf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : 8/15/2012
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force;
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force;
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
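Both kernels above evaluate erfc(alpha*r) with the Abramowitz and Stegun 7.1.26 rational approximation instead of a library call. EWALD_P and A1-A5 are defined elsewhere in the library; the values below are the standard ones and are stated here as an assumption. A host-side check against std::erfc:
#include <cmath>
#include <cstdio>
// Assumed constants (Abramowitz & Stegun eq. 7.1.26):
const double EWALD_P = 0.3275911;
const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741;
const double A4 = -1.453152027, A5 =  1.061405429;
int main() {
  double alpha = 0.5, r = 2.0, rsq = r*r;
  // Same sequence as in k_coul_dsf:
  double erfcd = exp(-alpha*alpha*rsq);
  double t = 1.0/(1.0 + EWALD_P*alpha*r);
  double erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
  std::printf("approx %.8f  erfc %.8f\n", erfcc, erfc(alpha*r));
  return 0;
}
The approximation agrees with erfc to roughly 1e-7, ample for single-precision kernels.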

78
lib/gpu/lal_coul_dsf.h Normal file
View File

@ -0,0 +1,78 @@
/***************************************************************************
coul_dsf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_COUL_DSF_H
#define LAL_COUL_DSF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CoulDSF : public BaseCharge<numtyp, acctyp> {
public:
CoulDSF();
~CoulDSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
- 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
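sp_lj above is indexed by the special-bond class packed into the top bits of each neighbor index; in the kernels, sbmask() extracts the class and NEIGHMASK strips it off. A host-side sketch of that decode, assuming the usual LAMMPS encoding of SBBITS = 30 and NEIGHMASK = 0x3FFFFFFF (both defined elsewhere in the library):
#include <cstdio>
const int SBBITS = 30;              // assumed, as in the LAMMPS neighbor code
const int NEIGHMASK = 0x3FFFFFFF;   // low 30 bits hold the neighbor index
int sbmask(int j) { return (j >> SBBITS) & 3; }
int main() {
  double sp_lj[4] = {1.0, 0.0, 0.0, 0.5};  // factor per special-bond class
  int j = (1 << SBBITS) | 42;              // neighbor 42, flagged 1-2 special
  std::printf("factor=%g index=%d\n", sp_lj[sbmask(j)], j & NEIGHMASK);
  return 0;   // prints factor=0 index=42
}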

125
lib/gpu/lal_coul_dsf_ext.cpp Normal file
View File

@ -0,0 +1,125 @@
/***************************************************************************
coul_dsf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to coul/dsf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_coul_dsf.h"
using namespace std;
using namespace LAMMPS_AL;
static CoulDSF<PRECISION,ACC_PRECISION> CDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) {
CDMF.clear();
gpu_mode=CDMF.device->gpu_mode();
double gpu_split=CDMF.device->particle_split();
int first_gpu=CDMF.device->first_device();
int last_gpu=CDMF.device->last_device();
int world_me=CDMF.device->world_me();
int gpu_rank=CDMF.device->gpu_rank();
int procs_per_gpu=CDMF.device->procs_per_gpu();
CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu);
bool message=false;
if (CDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha);
CDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha);
CDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CDMF.estimate_gpu_overhead();
return init_ok;
}
void cdsf_gpu_clear() {
CDMF.clear();
}
int** cdsf_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cdsf_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double cdsf_gpu_bytes() {
return CDMF.host_memory_usage();
}

170
lib/gpu/lal_dipole_lj.cpp Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
dipole_lj.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "dipole_lj_cl.h"
#elif defined(USE_CUDART)
const char *dipole_lj=0;
#else
#include "dipole_lj_cubin.h"
#endif
#include "lal_dipole_lj.h"
#include <cassert>
using namespace LAMMPS_AL;
#define DipoleLJT DipoleLJ<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
DipoleLJT::DipoleLJ() : BaseDipole<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
DipoleLJT::~DipoleLJ() {
clear();
}
template <class numtyp, class acctyp>
int DipoleLJT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int DipoleLJT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,dipole_lj,"k_dipole_lj");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void DipoleLJT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double DipoleLJT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(DipoleLJ<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class DipoleLJ<PRECISION,ACC_PRECISION>;
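type_pack4 above interleaves four per-type-pair coefficient tables into one vector of 4-component values, so a kernel reads lj1, lj2, and both squared cutoffs for a pair with a single load of lj1[mtype]. A minimal host-side sketch of that layout; type_pack4 itself lives elsewhere in the library, so this packing loop is an illustration only:
#include <vector>
struct numtyp4 { float x, y, z, w; };  // mirrors the device 4-vector
// Pack four ntypes x ntypes tables row-major, matching the
// lj1.x=lj1, lj1.y=lj2, lj1.z=cut_ljsq, lj1.w=cut_coulsq layout
// consumed by the dipole/cut kernels.
std::vector<numtyp4> pack4(int ntypes, double **a, double **b,
                           double **c, double **d) {
  std::vector<numtyp4> out(ntypes*ntypes);
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      numtyp4 &p = out[i*ntypes + j];
      p.x = (float)a[i][j]; p.y = (float)b[i][j];
      p.z = (float)c[i][j]; p.w = (float)d[i][j];
    }
  return out;
}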

501
lib/gpu/lal_dipole_lj.cu Normal file
View File

@ -0,0 +1,501 @@
// **************************************************************************
// dipole_lj.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the dipole/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[8][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
red_acc[7][tid]=ecoul; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<8; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
ecoul=red_acc[7][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=ecoul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*r3inv;
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
r7inv = r5inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
pre2 = (numtyp)3.0*r5inv*pjdotr;
pre3 = (numtyp)3.0*r5inv*pidotr;
pre4 = (numtyp)(-1.0)*r3inv;
forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x;
forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y;
forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr;
pre2 = qj*r3inv;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
pre2 = qtmp*r3inv;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
e = qtmp*qj*rinv;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*r3inv;
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
r7inv = r5inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
pre2 = (numtyp)3.0*r5inv*pjdotr;
pre3 = (numtyp)3.0*r5inv*pidotr;
pre4 = (numtyp)(-1.0)*r3inv;
forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x;
forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y;
forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr;
pre2 = qj*r3inv;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
pre2 = qtmp*r3inv;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0;
if (rsq < lj1[mtype].w) {
e = qtmp*qj*rinv;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr;
e *= fq;
} else e = (acctyp)0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
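The store_answers_tq macro defined at the top of this file reduces per-thread partial forces and torques with a strided binary tree: at each step the first half of the t_per_atom slots accumulates the second half. A serial illustration of the same loop on a plain array (no __local memory or thread synchronization is needed here):
#include <cstdio>
int main() {
  // Partial sums from t_per_atom = 8 cooperating threads (illustrative).
  double red_acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  unsigned int t_per_atom = 8;
  // Same stride pattern as in store_answers_tq:
  for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
    for (unsigned int offset = 0; offset < s; offset++)
      red_acc[offset] += red_acc[offset + s];
  std::printf("sum = %g\n", red_acc[0]);  // 36, the total of all partials
  return 0;
}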

85
lib/gpu/lal_dipole_lj.h Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
dipole_lj.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_DIPOLE_LJ_H
#define LAL_DIPOLE_LJ_H
#include "lal_base_dipole.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleLJ : public BaseDipole<numtyp, acctyp> {
public:
DipoleLJ();
~DipoleLJ();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
- 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

128
lib/gpu/lal_dipole_lj_ext.cpp Normal file
View File

@ -0,0 +1,128 @@
/***************************************************************************
dipole_lj_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to dipole/cut acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_dipole_lj.h"
using namespace std;
using namespace LAMMPS_AL;
static DipoleLJ<PRECISION,ACC_PRECISION> DPLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
DPLMF.clear();
gpu_mode=DPLMF.device->gpu_mode();
double gpu_split=DPLMF.device->particle_split();
int first_gpu=DPLMF.device->first_device();
int last_gpu=DPLMF.device->last_device();
int world_me=DPLMF.device->world_me();
int gpu_rank=DPLMF.device->gpu_rank();
int procs_per_gpu=DPLMF.device->procs_per_gpu();
DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu);
bool message=false;
if (DPLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
DPLMF.estimate_gpu_overhead();
return init_ok;
}
void dpl_gpu_clear() {
DPLMF.clear();
}
int** dpl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double **host_mu,
double *boxlo, double *prd) {
return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, host_mu, boxlo, prd);
}
void dpl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
double **host_mu, const int nlocal, double *boxlo, double *prd) {
DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
nlocal,boxlo,prd);
}
double dpl_gpu_bytes() {
return DPLMF.host_memory_usage();
}

170
lib/gpu/lal_dipole_lj_sf.cpp Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
dipole_lj_sf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/sf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "dipole_lj_sf_cl.h"
#elif defined(USE_CUDART)
const char *dipole_lj_sf=0;
#else
#include "dipole_lj_sf_cubin.h"
#endif
#include "lal_dipole_lj_sf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define DipoleLJSFT DipoleLJSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
DipoleLJSFT::DipoleLJSF() : BaseDipole<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
DipoleLJSFT::~DipoleLJSF() {
clear();
}
template <class numtyp, class acctyp>
int DipoleLJSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int DipoleLJSFT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,dipole_lj_sf,"k_dipole_lj_sf");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_cutsq);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void DipoleLJSFT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double DipoleLJSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(DipoleLJSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class DipoleLJSF<PRECISION,ACC_PRECISION>;
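The shifted-force LJ term in the dipole/sf kernels (next file) subtracts forceljsf, the magnitude the unshifted potential would have at the cutoff, so the pair force goes to zero continuously at r = rc. A quick numeric check of that property with illustrative coefficients:
#include <cstdio>
// F(r)/r for 12-6 LJ, matching r6inv*(lj1*r6inv-lj2)*r2inv in the kernel.
double lj_force_over_r(double rsq, double lj1, double lj2) {
  double r2inv = 1.0/rsq;
  double r6inv = r2inv*r2inv*r2inv;
  return r6inv*(lj1*r6inv - lj2)*r2inv;
}
int main() {
  double lj1 = 48.0, lj2 = 24.0;                 // illustrative coefficients
  double rc = 2.5;
  double shift = lj_force_over_r(rc*rc, lj1, lj2);  // forceljsf in the kernel
  for (double r = 2.3; r <= rc + 1e-9; r += 0.1)
    std::printf("r=%.1f  shifted F/r = %+.3e\n", r,
                lj_force_over_r(r*r, lj1, lj2) - shift);
  return 0;   // the last line, at r = rc, prints exactly +0.000e+00
}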

562
lib/gpu/lal_dipole_lj_sf.cu Normal file
View File

@ -0,0 +1,562 @@
// **************************************************************************
// dipole_lj_sf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the dipole/sf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[8][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
red_acc[7][tid]=ecoul; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<8; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
ecoul=red_acc[7][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=ecoul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv;
numtyp pre1, pre2, pre4;
numtyp pdotp, pidotr, pjdotr;
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
numtyp forceljcut = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
rcutlj2inv = ucl_recip(lj1[mtype].z);
rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;
force_lj = factor_lj * (forceljcut - forceljsf);
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
rcutcoul2inv = ucl_recip(lj1[mtype].w);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
aforcecoul.x = pre1*delx;
aforcecoul.y = pre1*dely;
aforcecoul.z = pre1*delz;
bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
(numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);
forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);
pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
pre4 = -bfac*r3inv;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
rcutcoul2inv=ucl_recip(lj1[mtype].w);
pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qj*r3inv * pqfac;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
rcutcoul2inv=ucl_recip(lj1[mtype].w);
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qtmp*r3inv * qpfac;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
e = qtmp*qj*rinv*fac*fac;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr * pqfac;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr * qpfac;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
(numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
(numtyp)4.0*lj3[mtype].y);
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv;
numtyp pre1, pre2, pre4;
numtyp pdotp, pidotr, pjdotr;
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
numtyp forceljcut = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv; // factor_lj applied once, below
rcutlj2inv = ucl_recip(lj1[mtype].z);
rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;
force_lj = factor_lj * (forceljcut - forceljsf);
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
rcutcoul2inv = ucl_recip(lj1[mtype].w);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
aforcecoul.x = pre1*delx;
aforcecoul.y = pre1*dely;
aforcecoul.z = pre1*delz;
bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
(numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);
forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);
pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
pre4 = -bfac*r3inv;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qj*r3inv * pqfac;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qtmp*r3inv * qpfac;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
e = qtmp*qj*rinv*fac*fac;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr * pqfac;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr * qpfac;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
(numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
(numtyp)4.0*lj3[mtype].y);
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
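For reference, the switching factors implemented by both kernels above can be written out explicitly; this is a reading aid, not part of the commit. With r_c the coulombic cutoff (lj1.w stores r_c^2, so rcutcoul2inv = 1/r_c^2):
\[
E_{qq}(r) = \frac{q_i q_j}{r}\left(1 - \frac{r}{r_c}\right)^{2}, \qquad
\mathrm{afac} = 1 - \frac{r^4}{r_c^4}, \qquad
\mathrm{bfac} = 1 - 4\,\frac{r^3}{r_c^3} + 3\,\frac{r^4}{r_c^4}, \qquad
\mathrm{pqfac} = \mathrm{qpfac} = 1 - 3\,\frac{r^2}{r_c^2} + 2\,\frac{r^3}{r_c^3}
\]
Every factor vanishes at r = r_c, so the electrostatic forces and torques go smoothly to zero at the cutoff; this is what makes the truncation "shifted-force".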

@ -0,0 +1,85 @@
/***************************************************************************
dipole_lj_sf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/sf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_DIPOLE_LJ_SF_H
#define LAL_DIPOLE_LJ_SF_H
#include "lal_base_dipole.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
public:
DipoleLJSF();
~DipoleLJSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = cutsq
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,128 @@
/***************************************************************************
dipole_lj_sf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to dipole/sf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_dipole_lj_sf.h"
using namespace std;
using namespace LAMMPS_AL;
static DipoleLJSF<PRECISION,ACC_PRECISION> DPLSFMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
DPLSFMF.clear();
gpu_mode=DPLSFMF.device->gpu_mode();
double gpu_split=DPLSFMF.device->particle_split();
int first_gpu=DPLSFMF.device->first_device();
int last_gpu=DPLSFMF.device->last_device();
int world_me=DPLSFMF.device->world_me();
int gpu_rank=DPLSFMF.device->gpu_rank();
int procs_per_gpu=DPLSFMF.device->procs_per_gpu();
DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu);
bool message=false;
if (DPLSFMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLSFMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLSFMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
DPLSFMF.estimate_gpu_overhead();
return init_ok;
}
void dplsf_gpu_clear() {
DPLSFMF.clear();
}
int** dplsf_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double **host_mu,
double *boxlo, double *prd) {
return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, host_mu, boxlo, prd);
}
void dplsf_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
double **host_mu, const int nlocal, double *boxlo, double *prd) {
DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
nlocal,boxlo,prd);
}
double dplsf_gpu_bytes() {
return DPLSFMF.host_memory_usage();
}

146
lib/gpu/lal_gauss.cpp Normal file
@ -0,0 +1,146 @@
/***************************************************************************
gauss.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the gauss pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "gauss_cl.h"
#elif defined(USE_CUDART)
const char *gauss=0;
#else
#include "gauss_cubin.h"
#endif
#include "lal_gauss.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GaussT Gauss<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
GaussT::Gauss() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
GaussT::~Gauss() {
clear();
}
template <class numtyp, class acctyp>
int GaussT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int GaussT::init(const int ntypes,
double **host_cutsq, double **host_a,
double **host_b, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,gauss,"k_gauss");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
host_cutsq,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=gauss1.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void GaussT::clear() {
if (!_allocated)
return;
_allocated=false;
gauss1.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double GaussT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Gauss<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void GaussT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Gauss<PRECISION,ACC_PRECISION>;
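The launch geometry in GaussT::loop() above splits each thread block of BX threads among t_per_atom cooperating threads per atom. A worked example of the arithmetic; the values below are illustrative, not library defaults:
#include <cmath>
#include <cstdio>
int main() {
  const int BX = 128;        // threads per block, this->block_size()
  const int t_per_atom = 4;  // threads cooperating on one atom
  const int inum = 10000;    // local atom count, this->ans->inum()
  // Each block covers BX/t_per_atom = 32 atoms, so the grid needs
  // ceil(10000/32) = 313 blocks.
  int GX = static_cast<int>(ceil(static_cast<double>(inum)/(BX/t_per_atom)));
  printf("GX = %d blocks\n", GX);   // prints GX = 313
  return 0;
}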

189
lib/gpu/lal_gauss.cu Normal file
@ -0,0 +1,189 @@
// **************************************************************************
// gauss.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the gauss pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<gauss1[mtype].z) {
numtyp r2inv = ucl_recip(rsq);
numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
gauss1[mtype].w);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
gauss1[tid]=gauss1_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<gauss1[mtype].z) {
numtyp r2inv = ucl_recip(rsq);
numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
gauss1[mtype].w);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
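The force expression in both kernels follows directly from the Gaussian pair energy. As a reading aid (A = gauss1.x, B = gauss1.y, offset = gauss1.w):
\[
E(r) = -A\,e^{-B r^2} + E_{\text{offset}}, \qquad
\frac{dE}{dr} = 2AB\,r\,e^{-B r^2}, \qquad
f_{\text{pair}} = -\frac{1}{r}\frac{dE}{dr} = -2AB\,e^{-B r^2}
\]
In the kernels this appears as (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*ucl_exp(-gauss1[mtype].y*rsq)*r2inv, where rsq*r2inv cancels to one; multiplying by delx, dely, delz then gives the Cartesian force components.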

77
lib/gpu/lal_gauss.h Normal file
@ -0,0 +1,77 @@
/***************************************************************************
gauss.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the gauss pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_GAUSS_H
#define LAL_GAUSS_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Gauss : public BaseAtomic<numtyp, acctyp> {
public:
Gauss();
~Gauss();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_a, double **host_b, double **host_offset,
double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset
UCL_D_Vec<numtyp4> gauss1;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

120
lib/gpu/lal_gauss_ext.cpp Normal file
@ -0,0 +1,120 @@
/***************************************************************************
gauss_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to gauss acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_gauss.h"
using namespace std;
using namespace LAMMPS_AL;
static Gauss<PRECISION,ACC_PRECISION> GLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
double **host_b, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
GLMF.clear();
gpu_mode=GLMF.device->gpu_mode();
double gpu_split=GLMF.device->particle_split();
int first_gpu=GLMF.device->first_device();
int last_gpu=GLMF.device->last_device();
int world_me=GLMF.device->world_me();
int gpu_rank=GLMF.device->gpu_rank();
int procs_per_gpu=GLMF.device->procs_per_gpu();
GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu);
bool message=false;
if (GLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
GLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
GLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GLMF.estimate_gpu_overhead();
return init_ok;
}
void gauss_gpu_clear() {
GLMF.clear();
}
int ** gauss_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void gauss_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double gauss_gpu_bytes() {
return GLMF.host_memory_usage();
}
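A minimal sketch of how a host code might drive the gauss_gpu_* interface above. All tables and the neighbor list come from the caller (in LAMMPS, the GPU variant of the gauss pair style); the argument values used here are placeholders, not library defaults:
#include <cstdio>
// Declarations matching the definitions in lal_gauss_ext.cpp:
int  gauss_gpu_init(const int, double**, double**, double**, double**,
                    double*, const int, const int, const int, const int,
                    const double, int&, FILE*);
void gauss_gpu_compute(const int, const int, const int, double**, int*,
                       int*, int*, int**, const bool, const bool,
                       const bool, const bool, int&, const double, bool&);
void gauss_gpu_clear();
double gauss_gpu_bytes();

void run_step(int ntypes, double **cutsq, double **a, double **b,
              double **offset, double *special_lj, int inum, int nall,
              double cell_size, double **x, int *type, int *ilist,
              int *numj, int **firstneigh) {
  int gpu_mode, host_start;
  bool success;
  if (gauss_gpu_init(ntypes, cutsq, a, b, offset, special_lj, inum, nall,
                     300, 0, cell_size, gpu_mode, stdout) != 0)
    return;                          // fall back to the CPU pair style
  gauss_gpu_compute(0, inum, nall, x, type, ilist, numj, firstneigh,
                    true, true, false, false, host_start, 0.0, success);
  printf("library host memory: %g bytes\n", gauss_gpu_bytes());
  gauss_gpu_clear();                 // release host and device buffers
}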

@ -0,0 +1,168 @@
/***************************************************************************
lj_coul_debye.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/debye pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_coul_debye_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_debye=0;
#else
#include "lj_coul_debye_cubin.h"
#endif
#include "lal_lj_coul_debye.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulDebyeT LJCoulDebye<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJCoulDebyeT::LJCoulDebye() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCoulDebyeT::~LJCoulDebye() {
clear();
}
template <class numtyp, class acctyp>
int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCoulDebyeT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double kappa) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_debye,"k_lj_debye");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_kappa=kappa;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCoulDebyeT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCoulDebyeT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCoulDebye<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCoulDebye<PRECISION,ACC_PRECISION>;
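For readers following the kernels below, the layout produced by the two type_pack4() calls above can be pictured as in this sketch. The real type_pack4() lives elsewhere in the library (lal_atom.h) and handles LAMMPS's 1-based type indexing; the coefficient meanings follow the usual LAMMPS convention (lj1 = 48*eps*sigma^12, lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6) and are an assumption here, not stated in this file:
// Sketch of the per-type-pair packing consumed by the k_lj_debye kernels.
struct quad { double x, y, z, w; };
void pack_types(int ntypes, int lj_types, quad *lj1, quad *lj3,
                double **host_lj1, double **host_lj2, double **host_lj3,
                double **host_lj4, double **host_cut_ljsq,
                double **host_cut_coulsq, double **host_offset) {
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      const int m = i*lj_types + j;      // mtype index used in the kernels
      lj1[m].x = host_lj1[i][j];         // LJ force coefficient
      lj1[m].y = host_lj2[i][j];         // LJ force coefficient
      lj1[m].z = host_cut_ljsq[i][j];    // LJ cutoff squared
      lj1[m].w = host_cut_coulsq[i][j];  // coulomb cutoff squared
      lj3[m].x = host_lj3[i][j];         // LJ energy coefficient
      lj3[m].y = host_lj4[i][j];         // LJ energy coefficient
      lj3[m].z = host_offset[i][j];      // energy shift at the LJ cutoff
    }
}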

@ -0,0 +1,256 @@
// **************************************************************************
// lj_coul_debye.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the lj/cut/coul/debye pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_debye(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r, rinv, screening;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
r = ucl_sqrt(rsq);
rinv = ucl_recip(r);
fetch(screening,j,q_tex);
screening *= ucl_exp(-kappa*r);
forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
}
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (rsq < lj1[mtype].w) {
e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r, rinv, screening;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
r = ucl_sqrt(rsq);
rinv = ucl_recip(r);
fetch(screening,j,q_tex);
screening *= ucl_exp(-kappa*r);
forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
}
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (rsq < lj1[mtype].w) {
e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
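The screened-coulomb branch in both kernels follows from differentiating the Debye-Hueckel pair energy. As a reading aid (not part of the commit):
\[
E(r) = q_i q_j\,\frac{e^{-\kappa r}}{r}, \qquad
F(r) = -\frac{dE}{dr} = q_i q_j\,\frac{e^{-\kappa r}}{r}\left(\kappa + \frac{1}{r}\right)
\]
The kernels store r*F(r) in forcecoul (the screening variable carries q_j*exp(-kappa*r), fetched from q_tex); the subsequent multiply by r2inv and by delx, dely, delz yields F(r)*Delta/r, the Cartesian force components, up to the qqrd2e unit conversion and the factor_coul special-bond weight.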

@ -0,0 +1,85 @@
/***************************************************************************
lj_coul_debye.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/debye pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_COUL_DEBYE_H
#define LAL_LJ_COUL_DEBYE_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
public:
LJCoulDebye();
~LJCoulDebye();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double kappa);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e,_kappa;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,129 @@
/***************************************************************************
lj_coul_debye_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to lj/cut/coul/debye acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_coul_debye.h"
using namespace std;
using namespace LAMMPS_AL;
static LJCoulDebye<PRECISION,ACC_PRECISION> LJCDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double kappa) {
LJCDMF.clear();
gpu_mode=LJCDMF.device->gpu_mode();
double gpu_split=LJCDMF.device->particle_split();
int first_gpu=LJCDMF.device->first_device();
int last_gpu=LJCDMF.device->last_device();
int world_me=LJCDMF.device->world_me();
int gpu_rank=LJCDMF.device->gpu_rank();
int procs_per_gpu=LJCDMF.device->procs_per_gpu();
LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu);
bool message=false;
if (LJCDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
LJCDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
LJCDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCDMF.estimate_gpu_overhead();
return init_ok;
}
void ljcd_gpu_clear() {
LJCDMF.clear();
}
int** ljcd_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcd_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljcd_gpu_bytes() {
return LJCDMF.host_memory_usage();
}

168
lib/gpu/lal_lj_dsf.cpp Normal file
@ -0,0 +1,168 @@
/***************************************************************************
lj_dsf.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "lj_dsf_cl.h"
#elif defined(USE_CUDART)
const char *lj_dsf=0;
#else
#include "lj_dsf_cubin.h"
#endif
#include "lal_lj_dsf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJDSFT LJDSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJDSFT::LJDSF() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJDSFT::~LJDSF() {
clear();
}
template <class numtyp, class acctyp>
int LJDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double e_shift, const double f_shift,
const double alpha) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_dsf,"k_lj_dsf");
if (success!=0)
return success;
_cut_coulsq=host_cut_coulsq;
_e_shift=e_shift;
_f_shift=f_shift;
_alpha=alpha;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJDSFT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJDSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJDSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJDSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJDSF<PRECISION,ACC_PRECISION>;
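The shared_types test in init() above decides between the generic kernel and the _fast variant in the device file below. A worked example, assuming a device where max_shared_types() returns 8; that value is device-dependent, so treat it as illustrative:
#include <cstdio>
int main() {
  const int ntypes = 4;             // atom types in the simulation
  const int max_shared_types = 8;   // assumed this->device->max_shared_types()
  const int block_size = 128;       // assumed this->_block_size
  int lj_types = ntypes;
  bool shared_types = false;
  if (lj_types <= max_shared_types && block_size >= max_shared_types) {
    lj_types = max_shared_types;    // coefficient tables padded to 8x8
    shared_types = true;            // k_lj_dsf_fast will be launched
  }
  // The fast kernel indexes the padded table as
  //   mtype = itype*MAX_SHARED_TYPES + jtype
  // (fast_mul((int)MAX_SHARED_TYPES,iw) + jx.w in the kernel), and all
  // 64 numtyp4 entries fit in __local/shared memory.
  printf("lj_types=%d shared=%d\n", lj_types, (int)shared_types);
  return 0;
}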

261
lib/gpu/lal_lj_dsf.cu Normal file
@ -0,0 +1,261 @@
// **************************************************************************
// lj_dsf.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj/cut/coul/dsf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : 7/12/2012
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
// Abramowitz-Stegun 7.1.26 constants for the erfc() approximation in the
// kernels below (standard values; required for EWALD_P and A1-A5 to resolve):
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].w) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].w) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
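Both kernels above approximate erfc(alpha*r) with a polynomial in t = 1/(1 + EWALD_P*alpha*r). Below is a quick host-side sanity check of that fit against a library erfc, assuming the standard Abramowitz-Stegun 7.1.26 coefficients; the file's own EWALD_P/A1-A5 defines are not shown in this excerpt and are assumed to match.
// Hedged host-side check of the erfc polynomial fit (assumed A&S 7.1.26
// coefficients; not taken from this commit's defines).
#include <cmath>
#include <cstdio>
int main() {
  const double EWALD_P = 0.3275911;
  const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 = 1.061405429;
  const double alpha = 0.25;
  for (double r = 1.0; r <= 8.0; r += 1.0) {
    double t = 1.0/(1.0 + EWALD_P*alpha*r);
    double fit = t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))
                 * std::exp(-alpha*alpha*r*r);
    std::printf("r=%3.1f fit=%.8f erfc=%.8f\n", r, fit, std::erfc(alpha*r));
  }
  return 0;
}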

85
lib/gpu/lal_lj_dsf.h Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
lj_dsf.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_DSF_H
#define LAL_LJ_DSF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJDSF : public BaseCharge<numtyp, acctyp> {
public:
LJDSF();
~LJDSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
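The e_shift and f_shift arguments to init() implement damped-shifted-force truncation: both the pair energy and force are shifted so they vanish at the Coulomb cutoff. A small numeric check of that property, assuming the Fennell-Gezelter definitions of the two shifts (the host code that actually fills these arguments is not part of this excerpt):
// Hedged check that DSF shifting zeroes energy and force at the cutoff;
// the e_shift/f_shift expressions are assumptions from the DSF method.
#include <cmath>
#include <cstdio>
int main() {
  const double alpha = 0.25, rc = 10.0;            // illustrative values
  const double pis = std::sqrt(std::acos(-1.0));   // sqrt(pi)
  const double erfcc = std::erfc(alpha*rc);
  const double erfcd = std::exp(-alpha*alpha*rc*rc);
  const double f_shift = -(erfcc/(rc*rc) + 2.0*alpha/pis*erfcd/rc);
  const double e_shift = erfcc/rc - f_shift*rc;
  // Per unit charge product, mirroring the kernel's expressions at r = rc:
  double e = erfcc/rc - e_shift - rc*f_shift;                   // -> 0
  double f = erfcc/(rc*rc) + 2.0*alpha/pis*erfcd/rc + f_shift;  // -> 0
  std::printf("E(rc)=%.3e  F(rc)=%.3e\n", e, f);
  return 0;
}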

132
lib/gpu/lal_lj_dsf_ext.cpp Normal file
View File

@ -0,0 +1,132 @@
/***************************************************************************
lj_dsf_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_dsf.h"
using namespace std;
using namespace LAMMPS_AL;
static LJDSF<PRECISION,ACC_PRECISION> LJDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double e_shift, const double f_shift,
const double alpha) {
LJDMF.clear();
gpu_mode=LJDMF.device->gpu_mode();
double gpu_split=LJDMF.device->particle_split();
int first_gpu=LJDMF.device->first_device();
int last_gpu=LJDMF.device->last_device();
int world_me=LJDMF.device->world_me();
int gpu_rank=LJDMF.device->gpu_rank();
int procs_per_gpu=LJDMF.device->procs_per_gpu();
LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu);
bool message=false;
if (LJDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
f_shift, alpha);
LJDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
f_shift, alpha);
LJDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJDMF.estimate_gpu_overhead();
return init_ok;
}
void ljd_gpu_clear() {
LJDMF.clear();
}
int** ljd_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljd_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljd_gpu_bytes() {
return LJDMF.host_memory_usage();
}

315
lib/gpu/lal_yukawa_colloid.cpp Normal file
View File

@ -0,0 +1,315 @@
/***************************************************************************
yukawa_colloid.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the yukawa/colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "yukawa_colloid_cl.h"
#elif defined(USE_CUDART)
const char *yukawa_colloid=0;
#else
#include "yukawa_colloid_cubin.h"
#endif
#include "lal_yukawa_colloid.h"
#include <cassert>
using namespace LAMMPS_AL;
#define YukawaColloidT YukawaColloid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
_allocated(false), _max_rad_size(0) {
}
template <class numtyp, class acctyp>
YukawaColloidT::~YukawaColloid() {
clear();
}
template <class numtyp, class acctyp>
int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int YukawaColloidT::init(const int ntypes,
double **host_cutsq, double **host_a,
double **host_offset, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen, const double kappa) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,yukawa_colloid,"k_yukawa_colloid");
if (success!=0)
return success;
// allocate rad
bool cpuview=false;
if (this->ucl_device->device_type()==UCL_CPU)
cpuview=true;
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (cpuview)
dev_rad.view(host_rad);
else
dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY);
rad_tex.get_texture(*(this->pair_program),"rad_tex");
rad_tex.bind_float(dev_rad,1);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
_kappa = kappa;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types*32; i++)
host_write[i]=(numtyp)0.0;
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
host_offset,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void YukawaColloidT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff.clear();
sp_lj.clear();
host_rad.clear();
dev_rad.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double YukawaColloidT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(YukawaColloid<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then compute atom energies/forces
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void YukawaColloidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, int *ilist,
int *numj, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *rad) {
this->acc_timers();
// ------------------- Resize rad array --------------------------
if (nall>_max_rad_size) {
dev_rad.clear();
host_rad.clear();
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
}
} else {
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
}
rad_tex.bind_float(dev_rad,1);
}
// ----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if a different hybrid style reallocated data
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
host_start=inum;
// -----------------------------------------------------------------
if (ago==0) {
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
this->atom->cast_x_data(host_x,host_type);
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
this->add_rad_data();
this->loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU and then compute per-atom densities
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, bool &success,
double *rad) {
this->acc_timers();
// ------------------- Resize rad array ----------------------------
if (nall>_max_rad_size) {
dev_rad.clear();
host_rad.clear();
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
}
} else {
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
}
rad_tex.bind_float(dev_rad,1);
}
// -----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if a different hybrid style reallocated data
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
// load balance, returning the atom count on the device (inum)
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
this->add_rad_data();
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
this->loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies and forces
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
}
this->time_pair.stop();
}
template class YukawaColloid<PRECISION,ACC_PRECISION>;
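For reference, the grid sizing in loop() gives each block BX/t_per_atom atoms, so GX blocks cover all device atoms. A standalone illustration of that arithmetic (the BX, t_per_atom, and inum values are invented for the example):
// Illustrative grid-size arithmetic matching loop(); values are examples,
// not values taken from this commit.
#include <cmath>
#include <cstdio>
int main() {
  const int BX = 128;        // threads per block
  const int t_per_atom = 4;  // threads cooperating on one atom
  const int inum = 10000;    // atoms assigned to the device
  int GX = static_cast<int>(std::ceil(static_cast<double>(inum)/
                                      (BX/t_per_atom)));
  std::printf("%d atoms per block -> GX = %d blocks\n", BX/t_per_atom, GX);
  return 0;
}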

202
lib/gpu/lal_yukawa_colloid.cu Normal file
View File

@ -0,0 +1,202 @@
// **************************************************************************
// yukawa_colloid.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the yukawa/colloid pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> rad_tex;
#else
texture<int4,1> pos_tex;
texture<int2> rad_tex;
#endif
#else
#define pos_tex x_
#define rad_tex rad_
#endif
__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp radj; fetch(radj,j,rad_tex);
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x/kappa * screening;
energy+=factor_lj*(e-coeff[mtype].y);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff[tid]=coeff_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp radj; fetch(radj,j,rad_tex);
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x/kappa * screening;
energy+=factor_lj*(e-coeff[mtype].y);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
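The kernels above evaluate a pair energy of the form E(r) = (a/kappa)*exp(-kappa*(r-(radi+radj))) minus a per-type offset, with radial force magnitude a*exp(-kappa*(r-(radi+radj))). A minimal host-side consistency check that this force expression is -dE/dr (all parameter values here are illustrative):
// Hedged check that force = -dE/dr for the yukawa/colloid form above;
// parameter values are invented for the example.
#include <cmath>
#include <cstdio>
int main() {
  const double a = 2.0, kappa = 1.5, radi = 0.8, radj = 1.2;
  const double r = 3.0, h = 1e-6;
  auto E = [&](double rr) {
    return a/kappa*std::exp(-kappa*(rr - (radi + radj)));
  };
  double analytic = a*std::exp(-kappa*(r - (radi + radj))); // as in kernel
  double numeric  = -(E(r + h) - E(r - h))/(2.0*h);         // central diff
  std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);
  return 0;
}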

127
lib/gpu/lal_yukawa_colloid.h Normal file
View File

@ -0,0 +1,127 @@
/***************************************************************************
yukawa_colloid.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the yukawa/colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_YUKAWA_COLLOID_H
#define LAL_YUKAWA_COLLOID_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
public:
YukawaColloid();
~YukawaColloid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_a, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double kappa);
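/// Stage per-atom radii for the device; on a CPU device with a double
/// numtyp this is a zero-copy view of the LAMMPS array, otherwise the
/// values are cast/copied into the pinned host buffer for add_rad_data()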
inline void cast_rad_data(double* rad) {
int nall = this->atom->nall();
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
} else {
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
}
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_rad.begin(),rad,nall*sizeof(numtyp));
else {
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
}
}
}
// Copy rad to device asynchronously
inline void add_rad_data() {
ucl_copy(dev_rad,host_rad,this->atom->nall(),true);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *rad);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *rad);
// --------------------------- TEXTURES -----------------------------
UCL_Texture rad_tex;
// --------------------------- TYPE DATA --------------------------
/// coeff.x = a, coeff.y = offset, coeff.z = cutsq
UCL_D_Vec<numtyp4> coeff;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
int _max_rad_size;
numtyp _kappa;
/// Per-atom arrays
UCL_H_Vec<numtyp> host_rad;
UCL_D_Vec<numtyp> dev_rad;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
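The 1.10 factor used for _max_rad_size here and in the compute() resize paths over-allocates by ten percent so that small growth in the atom count does not trigger a reallocation every step. A minimal illustration of that policy (the nall sequence is invented):
// Sketch of the 10%-slack growth policy used for the rad arrays.
#include <cstdio>
#include <initializer_list>
int main() {
  int max_size = 0;
  for (int nall : {1000, 1050, 1099, 1200}) {
    if (nall > max_size) {
      max_size = static_cast<int>(static_cast<double>(nall)*1.10);
      std::printf("nall=%d -> realloc to capacity %d\n", nall, max_size);
    } else {
      std::printf("nall=%d -> reuse capacity %d\n", nall, max_size);
    }
  }
  return 0;
}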

123
lib/gpu/lal_yukawa_colloid_ext.cpp Normal file
View File

@ -0,0 +1,123 @@
/***************************************************************************
yukawa_colloid_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to yukawa/colloid acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_yukawa_colloid.h"
using namespace std;
using namespace LAMMPS_AL;
static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
double **host_offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
const double kappa) {
YKCOLLMF.clear();
gpu_mode=YKCOLLMF.device->gpu_mode();
double gpu_split=YKCOLLMF.device->particle_split();
int first_gpu=YKCOLLMF.device->first_device();
int last_gpu=YKCOLLMF.device->last_device();
int world_me=YKCOLLMF.device->world_me();
int gpu_rank=YKCOLLMF.device->gpu_rank();
int procs_per_gpu=YKCOLLMF.device->procs_per_gpu();
YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu);
bool message=false;
if (YKCOLLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, kappa);
YKCOLLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, kappa);
YKCOLLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
YKCOLLMF.estimate_gpu_overhead();
return init_ok;
}
void ykcolloid_gpu_clear() {
YKCOLLMF.clear();
}
int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_rad) {
return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_rad);
}
void ykcolloid_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_rad) {
YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,
success,host_rad);
}
double ykcolloid_gpu_bytes() {
return YKCOLLMF.host_memory_usage();
}