diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h
new file mode 100644
index 0000000000..803cd78b4c
--- /dev/null
+++ b/lib/gpu/geryon/ucl_matrix.h
@@ -0,0 +1,224 @@
+/***************************************************************************
+                                ucl_matrix.h
+                             -------------------
+                               W. Michael Brown
+
+  Matrix Container on Host
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Thu May 10 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+// Only allow this file to be included by CUDA and OpenCL specific headers
+#ifdef _UCL_MAT_ALLOW
+
+/// Matrix S-Object
+template <class hosttype, class devtype>
+class UCL_Matrix {
+ public:
+  // Traits for copying data
+  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
+  enum traits {
+    DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
+    MEM_TYPE = 1,
+    PADDED = 0,
+    ROW_MAJOR = 1,
+    VECTOR = 0
+  };
+  typedef hosttype data_type;
+
+  /// Host Allocation
+  UCL_H_Mat<hosttype> host;
+
+  /// Device Allocation
+  UCL_D_Mat<devtype> device;
+
+  UCL_Matrix() { }
+  ~UCL_Matrix() { }
+
+  /// Construct with specified number of rows and columns
+  /** \sa alloc() **/
+  UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
+             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
+
+  /// Set up host matrix with specified # of rows/cols and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \note When passing a command queue instead of a device, the device
+    *       allocation is always performed, even if the device shares
+    *       memory with the host.
+    * \param cq Default command queue for operations copied from another mat
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  template <class mat_type>
+  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
+
+  /// Set up host matrix with specified # of rows/cols and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \param acc Used to get the default command queue for operations
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
+
+  /// Free memory and set size to 0
+  inline void clear()
+    { host.clear(); device.clear(); }
+
+  /// Resize the allocation to contain rows*cols elements
+  inline int resize(const int rows, const int cols) {
+    assert(host.kind()!=UCL_VIEW);
+    int err=host.resize(rows,cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+    return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+      dev_resize(device,host,_buffer,rows,cols);
+  }
+
+  /// Resize (only if bigger) the allocation to contain rows*cols elements
+  inline int resize_ib(const int new_rows, const int new_cols)
+    { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
+      else return UCL_SUCCESS; }
+
+  /// Set each element to zero
+  inline void zero() { host.zero(); device.zero(); }
+
+  /// Set first n elements to zero
+  inline void zero(const int n) { host.zero(n); device.zero(n); }
+
+  /// Get the number of elements
+  inline size_t numel() const { return host.numel(); }
+  /// Get the number of rows
+  inline size_t rows() const { return host.rows(); }
+  /// Get the number of columns
+  inline size_t cols() const { return host.cols(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t host_mem_usage()
+    { return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t device_mem_usage()
+    { return device.row_bytes()*device.rows(); }
+
+  /// Get element at index i
+  inline hosttype & operator[](const int i) { return host[i]; }
+  /// Get element at index i
+  inline const hosttype & operator[](const int i) const { return host[i]; }
+  /// 2D access
+  inline hosttype & operator()(const int row, const int col)
+    { return host(row,col); }
+  /// 2D access
+  inline const hosttype & operator()(const int row, const int col) const
+    { return host(row,col); }
+
+  /// Returns pointer to memory pointer for allocation on host
+  inline hosttype ** host_ptr() { return host.host_ptr(); }
+
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return host.cq(); }
+  /// Block until command_queue associated with matrix is complete
+  inline void sync() { host.sync(); }
+
+  /// Get the size of a row on the host (including any padding) in elements
+  inline size_t row_size() const { return host.row_size(); }
+  /// Get the size of a row on the host (including any padding) in bytes
+  inline size_t row_bytes() const { return host.row_bytes(); }
+  /// Get the size on the host in bytes of 1 element
+  inline int element_size() const { return sizeof(hosttype); }
+
+  /// Update the allocation on the host asynchronously
+  inline void update_host()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,true); }
+  /// Update the allocation on the host (true for asynchronous copy)
+  inline void update_host(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,async); }
+  /// Update the allocation on the host (using command queue)
+  inline void update_host(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,cq); }
+  /// Update the first n elements on the host (true for asynchronous copy)
+  inline void update_host(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,async); }
+  /// Update the first n elements on the host (using command queue)
+  inline void update_host(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,cq); }
+  /// Update slice on the host (true for asynchronous copy)
+  inline void update_host(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,async); }
+  /// Update slice on the host (using command queue)
+  inline void update_host(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,cq); }
+
+  /// Update the allocation on the device asynchronously
+  inline void update_device()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,true); }
+  /// Update the allocation on the device (true for asynchronous copy)
+  inline void update_device(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,async); }
+  /// Update the allocation on the device (using command queue)
+  inline void update_device(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,cq); }
+  /// Update the first n elements on the device (true for asynchronous copy)
+  inline void update_device(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,async); }
+  /// Update the first n elements on the device (using command queue)
+  inline void update_device(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,cq); }
+  /// Update slice on the device (true for asynchronous copy)
+  inline void update_device(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,async); }
+  /// Update slice on the device (using command queue)
+  inline void update_device(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,cq); }
+
+ private:
+  UCL_H_Mat<devtype> _buffer;
+};
+
+#endif
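The matrix s-object keeps a host view and a device view in lockstep. A minimal usage sketch (not part of the patch; the device object, element types, and sizes are illustrative assumptions):

```cpp
// Illustrative use of UCL_Matrix (sketch; assumes a CUDA or OpenCL Geryon
// front-end header has defined UCL_Device and the containers).
void matrix_demo() {
  UCL_Device dev;
  UCL_Matrix<double,float> m;            // double on host, float on device
  if (m.alloc(4,8,dev)!=UCL_SUCCESS) return;
  for (size_t i=0; i<m.numel(); i++)
    m[i]=1.0;                            // fill through the host view
  m.update_device();                     // asynchronous host->device copy
  // ... kernels consume m.device here ...
  m.update_host(false);                  // blocking device->host copy
}
```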
diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h
new file mode 100644
index 0000000000..ea772d3608
--- /dev/null
+++ b/lib/gpu/geryon/ucl_s_obj_help.h
@@ -0,0 +1,272 @@
+/***************************************************************************
+                              ucl_s_obj_help.h
+                             -------------------
+                               W. Michael Brown
+
+  Helper routines for allocating memory for s-objects and performing
+  host/device updates. (Different routines depending on whether the
+  same type is used on the host and device.)
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Mon May 14 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+template <int st> struct _ucl_s_obj_help;
+
+// Host and device containers are same type
+//   -- Don't need casting buffers
+//   -- Can potentially use same memory if shared by accelerator
+template <> struct _ucl_s_obj_help<1> {
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    if (acc.shared_memory()) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    if (acc.shared_memory()) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(rows,cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(rows,cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
+    ucl_copy(dst,src,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
+    ucl_copy(dst,src,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          const bool async) {
+    ucl_copy(dst,src,cols,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          command_queue &cq) {
+    ucl_copy(dst,src,cols,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, const bool async) {
+    ucl_copy(dst,src,rows,cols,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, command_queue &cq) {
+    ucl_copy(dst,src,rows,cols,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff,
+                               const int cols) {
+    if (device.kind()==UCL_VIEW) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(cols);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
+                               const int cols) {
+    if (device.kind()==UCL_VIEW) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(rows,cols);
+  }
+};
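The integer template argument that selects between the two helper specializations comes from a compile-time type-equality trait. A minimal illustration of the idiom, under the assumption that `ucl_same_type` follows the usual partial-specialization pattern (its real definition lives elsewhere in Geryon):

```cpp
// Assumed shape of the trait driving the dispatch (illustrative only):
template <class t1, class t2> struct same_type_demo { enum { ans=0 }; };
template <class t1> struct same_type_demo<t1,t1>    { enum { ans=1 }; };
// _ucl_s_obj_help< same_type_demo<hosttype,devtype>::ans > then resolves
// to the buffer-free specialization above when host and device types match,
// and to the casting version below otherwise.
```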
+
+// Host and device containers are different types
+template <int st> struct _ucl_s_obj_help {
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,acc,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+
+    if (acc.shared_memory()) {
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,cq,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(rows,cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+
+    if (acc.shared_memory()) {
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(rows,cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(rows,cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(rows,cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
+    ucl_cast_copy(dst,src,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
+    ucl_cast_copy(dst,src,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          const bool async) {
+    ucl_cast_copy(dst,src,cols,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          command_queue &cq) {
+    ucl_cast_copy(dst,src,cols,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, const bool async) {
+    ucl_cast_copy(dst,src,rows,cols,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, command_queue &cq) {
+    ucl_cast_copy(dst,src,rows,cols,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff,
+                               const int cols) {
+    int err=buff.resize(cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+
+    if (device.kind()==UCL_VIEW) {
+      device.view(buff);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(cols);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
+                               const int cols) {
+    int err=buff.resize(rows,cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+
+    if (device.kind()==UCL_VIEW) {
+      device.view(buff);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(rows,cols);
+  }
+};
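For the mixed-precision case, every transfer is staged through the page-locked `_buffer`, which holds device-type data on the host. A sketch of the data flow for `hosttype=double`, `devtype=float` (assumed element-wise semantics; `ucl_cast_copy` performs the real work):

```cpp
// host (double) --cast--> _buffer (float, pinned) --DMA--> device (float)
// Roughly what the cast step amounts to, element by element:
void cast_stage_demo(const double *host, float *buffer, size_t n) {
  for (size_t i=0; i<n; i++)
    buffer[i]=static_cast<float>(host[i]);  // cast on the host side
  // ...then one contiguous buffer->device transfer on the command queue
}
```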
diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h
new file mode 100644
index 0000000000..01346127c2
--- /dev/null
+++ b/lib/gpu/geryon/ucl_vector.h
@@ -0,0 +1,223 @@
+/***************************************************************************
+                                ucl_vector.h
+                             -------------------
+                               W. Michael Brown
+
+  Vector Container on Host
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Thu May 10 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+// Only allow this file to be included by CUDA and OpenCL specific headers
+#ifdef _UCL_MAT_ALLOW
+
+/// Row Vector S-Object
+template <class hosttype, class devtype>
+class UCL_Vector {
+ public:
+  // Traits for copying data
+  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
+  enum traits {
+    DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
+    MEM_TYPE = 1,
+    PADDED = 0,
+    ROW_MAJOR = 1,
+    VECTOR = 1
+  };
+  typedef hosttype data_type;
+
+  /// Host Allocation
+  UCL_H_Vec<hosttype> host;
+
+  /// Device Allocation
+  UCL_D_Vec<devtype> device;
+
+  UCL_Vector() { }
+  ~UCL_Vector() { }
+
+  /// Construct with n columns
+  /** \sa alloc() **/
+  UCL_Vector(const size_t cols, UCL_Device &acc,
+             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,acc,kind1,kind2); }
+
+  /// Set up the vector with 'cols' columns and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \note When passing a command queue instead of a device, the device
+    *       allocation is always performed, even if the device shares
+    *       memory with the host.
+    * \param cq Default command queue for operations copied from another mat
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  template <class mat_type>
+  inline int alloc(const size_t cols, mat_type &cq,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,cq,kind1,kind2); }
+
+  /// Set up host vector with 'cols' columns and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \param acc Used to get the default command queue for operations
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  inline int alloc(const size_t cols, UCL_Device &acc,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,acc,kind1,kind2); }
+
+  /// Free memory and set size to 0
+  inline void clear()
+    { host.clear(); device.clear(); }
+
+  /// Resize the allocation to contain cols elements
+  inline int resize(const int cols) {
+    assert(host.kind()!=UCL_VIEW);
+    int err=host.resize(cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+    return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+      dev_resize(device,host,_buffer,cols);
+  }
+
+  /// Resize (only if bigger) the allocation to contain cols elements
+  inline int resize_ib(const int new_cols)
+    { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
+
+  /// Set each element to zero
+  inline void zero() { host.zero(); device.zero(); }
+
+  /// Set first n elements to zero
+  inline void zero(const int n) { host.zero(n); device.zero(n); }
+
+  /// Get the number of elements
+  inline size_t numel() const { return host.numel(); }
+  /// Get the number of rows
+  inline size_t rows() const { return host.rows(); }
+  /// Get the number of columns
+  inline size_t cols() const { return host.cols(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t host_mem_usage()
+    { return host.row_bytes()+_buffer.row_bytes(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t device_mem_usage()
+    { return device.row_bytes(); }
+
+  /// Get element at index i
+  inline hosttype & operator[](const int i) { return host[i]; }
+  /// Get element at index i
+  inline const hosttype & operator[](const int i) const { return host[i]; }
+  /// 2D access (row should always be 0)
+  inline hosttype & operator()(const int row, const int col)
+    { return host[col]; }
+  /// 2D access (row should always be 0)
+  inline const hosttype & operator()(const int row, const int col) const
+    { return host[col]; }
+
+  /// Returns pointer to memory pointer for allocation on host
+  inline hosttype ** host_ptr() { return host.host_ptr(); }
+
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return host.cq(); }
+  /// Block until command_queue associated with vector is complete
+  inline void sync() { host.sync(); }
+
+  /// Get the size of a row on the host (including any padding) in elements
+  inline size_t row_size() const { return host.row_size(); }
+  /// Get the size of a row on the host (including any padding) in bytes
+  inline size_t row_bytes() const { return host.row_bytes(); }
+  /// Get the size on the host in bytes of 1 element
+  inline int element_size() const { return sizeof(hosttype); }
+
+  /// Update the allocation on the host asynchronously
+  inline void update_host()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,true); }
+  /// Update the allocation on the host (true for asynchronous copy)
+  inline void update_host(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,async); }
+  /// Update the allocation on the host (using command queue)
+  inline void update_host(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,cq); }
+  /// Update the first n elements on the host (true for asynchronous copy)
+  inline void update_host(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,async); }
+  /// Update the first n elements on the host (using command queue)
+  inline void update_host(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,cq); }
+  /// Update slice on the host (true for asynchronous copy)
+  inline void update_host(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,async); }
+  /// Update slice on the host (using command queue)
+  inline void update_host(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,cq); }
+
+  /// Update the allocation on the device asynchronously
+  inline void update_device()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,true); }
+  /// Update the allocation on the device (true for asynchronous copy)
+  inline void update_device(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,async); }
+  /// Update the allocation on the device (using command queue)
+  inline void update_device(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,cq); }
+  /// Update the first n elements on the device (true for asynchronous copy)
+  inline void update_device(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,async); }
+  /// Update the first n elements on the device (using command queue)
+  inline void update_device(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,cq); }
+  /// Update slice on the device (true for asynchronous copy)
+  inline void update_device(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,async); }
+  /// Update slice on the device (using command queue)
+  inline void update_device(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,cq); }
+
+ private:
+  UCL_H_Vec<devtype> _buffer;
+};
+
+#endif
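The grow-only `resize_ib` is aimed at per-step buffers whose required size fluctuates. A hypothetical usage pattern (sketch; `counts` and the fill logic are invented for illustration):

```cpp
// Hypothetical per-step pattern built on UCL_Vector (sketch; dev assumed
// to be an initialized UCL_Device, counts[] an assumed per-step size list):
void fill_steps(UCL_Device &dev, const int *counts, int nsteps) {
  UCL_Vector<float,float> buf;
  buf.alloc(1024,dev);
  for (int step=0; step<nsteps; step++) {
    buf.resize_ib(counts[step]);          // reallocates only when growing
    for (int i=0; i<counts[step]; i++)
      buf[i]=0.0f;                        // fill host side
    buf.update_device(counts[step],true); // async copy of first n elements
  }
}
```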
diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp
new file mode 100644
index 0000000000..7c090e6351
--- /dev/null
+++ b/lib/gpu/lal_base_dipole.cpp
@@ -0,0 +1,315 @@
+/***************************************************************************
+                               base_dipole.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  dipole, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include "lal_base_dipole.h"
+using namespace LAMMPS_AL;
+#define BaseDipoleT BaseDipole<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> global_device;
+
+template <class numtyp, class acctyp>
+BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
+}
+
+template <class numtyp, class acctyp>
+BaseDipoleT::~BaseDipole() {
+  delete ans;
+  delete nbor;
+}
+
+template <class numtyp, class acctyp>
+int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BaseDipoleT::init_atomic(const int nlocal, const int nall,
+                             const int max_nbors, const int maxspecial,
+                             const double cell_size,
+                             const double gpu_split, FILE *_screen,
+                             const void *pair_program,
+                             const char *k_name) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false,
+                           _threads_per_atom);
+  if (success!=0)
+    return success;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
+  compile_kernels(*ucl_device,pair_program,k_name);
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->x,4);
+  q_tex.bind_float(atom->q,1);
+  mu_tex.bind_float(atom->quat,4);
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::clear_atomic() {
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,
+                       screen);
+
+  if (_compiled) {
+    k_pair_fast.clear();
+    k_pair.clear();
+    delete pair_program;
+    _compiled=false;
+  }
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+  device->clear();
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
+                               int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return NULL;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, int *tag,
+                                         int **nspecial, int **special,
+                                         bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BaseDipoleT::compute(const int f_ago, const int inum_full,
+                          const int nall, double **host_x, int *host_type,
+                          int *ilist, int *numj, int **firstneigh,
+                          const bool eflag, const bool vflag,
+                          const bool eatom, const bool vatom,
+                          int &host_start, const double cpu_time,
+                          bool &success, double *host_q, double **host_mu,
+                          const int nlocal, double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  atom->cast_quat_data(host_mu[0]);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+  atom->add_quat_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseDipoleT::compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, int *tag,
+                           int **nspecial, int **special, const bool eflag,
+                           const bool vflag, const bool eatom,
+                           const bool vatom, int &host_start,
+                           int **ilist, int **jnum,
+                           const double cpu_time, bool &success,
+                           double *host_q, double **host_mu,
+                           double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    atom->cast_q_data(host_q);
+    atom->cast_quat_data(host_mu[0]);
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    atom->cast_q_data(host_q);
+    atom->cast_quat_data(host_mu[0]);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  atom->add_q_data();
+  atom->add_quat_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double BaseDipoleT::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *kname) {
+  if (_compiled)
+    return;
+
+  std::string s_fast=std::string(kname)+"_fast";
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE)+" -D"+
+                    std::string(OCL_VENDOR);
+
+  pair_program=new UCL_Program(dev);
+  pair_program->load_string(pair_str,flags.c_str());
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,kname);
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  q_tex.get_texture(*pair_program,"q_tex");
+  mu_tex.get_texture(*pair_program,"mu_tex");
+
+  _compiled=true;
+}
+
+template class BaseDipole<PRECISION,ACC_PRECISION>;
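For orientation, a sketch of how a LAMMPS-side wrapper drives the device-neighboring `compute()` each timestep (argument names are placeholders; the real call sites live in the `*_ext.cpp` wrappers):

```cpp
// Hypothetical caller of the device-neighboring compute() (sketch only).
void step(BaseDipole<float,float> &lmp_gpu, int ago, int inum, int nall,
          double **x, int *type, double *sublo, double *subhi, int *tag,
          int **nspecial, int **special, double *q, double **mu,
          double *boxlo, double *prd) {
  int host_start, *ilist, *jnum;
  bool success;
  int **firstneigh=lmp_gpu.compute(ago,inum,nall,x,type,sublo,subhi,tag,
                                   nspecial,special,true,true,false,false,
                                   host_start,&ilist,&jnum,0.0,success,
                                   q,mu,boxlo,prd);
  // firstneigh / ilist / jnum then feed the CPU portion of the force loop
  (void)firstneigh;
}
```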
diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h
new file mode 100644
index 0000000000..51e357afda
--- /dev/null
+++ b/lib/gpu/lal_base_dipole.h
@@ -0,0 +1,200 @@
+/***************************************************************************
+                                base_dipole.h
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  dipole, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BASE_DIPOLE_H
+#define LAL_BASE_DIPOLE_H
+
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BaseDipole {
+ public:
+  BaseDipole();
+  virtual ~BaseDipole();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const void *pair_program, const char *k_name);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->x,4);
+      q_tex.bind_float(atom->q,1);
+      mu_tex.bind_float(atom->quat,4);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param max_nbors current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors,
+                           bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to be copied
+    *        to host
+    * \param max_nbors current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note nlocal+host_inum=total number local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers();
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, int *tag,
+                       int **nspecial, int **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               double **mu, const int nlocal, double *boxlo, double *prd);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
+                int **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double **mu, double *boxlo, double *prd);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Device Properties and Atom and Neighbor storage
+  Device<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  Balance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> *atom;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  Neighbor *nbor;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_pair_fast, k_pair;
+  inline int block_size() { return _block_size; }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture q_tex;
+  UCL_Texture mu_tex;
+
+ protected:
+  bool _compiled;
+  int _block_size, _block_bio_size, _threads_per_atom;
+  double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+                       const char *k);
+
+  virtual void loop(const bool _eflag, const bool _vflag) = 0;
+};
+
+}
+
+#endif
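Since `loop()` is the only pure virtual, a concrete style needs little beyond a kernel launch. A skeletal derived class (illustrative only; the real dipole styles in this library follow this shape but add coefficient storage, an `init()`, and kernel arguments):

```cpp
// Sketch of a concrete style on top of BaseDipole (not part of the patch).
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleExample : public BaseDipole<numtyp,acctyp> {
 private:
  void loop(const bool _eflag, const bool _vflag) {
    const int BX=this->block_size();
    this->time_pair.start();
    // set_size()/run() on this->k_pair or this->k_pair_fast goes here
    this->time_pair.stop();
    (void)BX; (void)_eflag; (void)_vflag;
  }
};
}
```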
diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp
new file mode 100644
index 0000000000..1665717c2d
--- /dev/null
+++ b/lib/gpu/lal_born.cpp
@@ -0,0 +1,162 @@
+/***************************************************************************
+                                   born.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_cl.h"
+#elif defined(USE_CUDART)
+const char *born=0;
+#else
+#include "born_cubin.h"
+#endif
+
+#include "lal_born.h"
+#include <math.h>
+using namespace LAMMPS_AL;
+#define BornT Born<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+BornT::~Born() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int BornT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BornT::init(const int ntypes, double **host_cutsq,
+                double **host_rhoinv, double **host_born1,
+                double **host_born2, double **host_born3,
+                double **host_a, double **host_c,
+                double **host_d, double **host_sigma,
+                double **host_offset, double *host_special_lj,
+                const int nlocal, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size,
+                const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,born,"k_born");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+
+  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
+                         host_d,host_offset);
+
+  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
+                         host_sigma);
+
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _allocated=true;
+  this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+    +cutsq_sigma.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
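The `type_pack*` calls flatten the per-type coefficient tables into vec4/vec2 device vectors. A sketch of the assumed semantics (illustrative stand-in, not the library routine; LAMMPS type indices are 1-based and `host_write` is the staging buffer allocated above):

```cpp
// Assumed effect of type_pack4 for the coeff1 call (sketch only).
struct vec4 { double x,y,z,w; };
inline void pack4_demo(int ntypes, int lj_types, vec4 *out,
                       double **a, double **b, double **c, double **d) {
  for (int i=1; i<=ntypes; i++)
    for (int j=1; j<=ntypes; j++) {
      vec4 v={a[i][j],b[i][j],c[i][j],d[i][j]};
      out[i*lj_types+j]=v;            // row-major, 1-based type indices
    }
  // ...then one contiguous copy of the packed table to the device
}
```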
+
+template <class numtyp, class acctyp>
+void BornT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  coeff1.clear();
+  coeff2.clear();
+  cutsq_sigma.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double BornT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(Born<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BornT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2,
+                          &cutsq_sigma, &sp_lj,
+                          &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
+                     &cutsq_sigma, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class Born<PRECISION,ACC_PRECISION>;
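The grid sizing in `loop()` divides each block between threads that cooperate on one atom. A worked example of the arithmetic (values are illustrative):

```cpp
#include <math.h>
// With BX=128 threads per block and t_per_atom=4, each block covers
// BX/t_per_atom=32 atoms; 10000 atoms then need ceil(10000/32)=313 blocks.
static int grid_size(int inum, int BX, int t_per_atom) {
  return static_cast<int>(ceil(static_cast<double>(inum)/(BX/t_per_atom)));
}
```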
diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu
new file mode 100644
index 0000000000..3410a5effd
--- /dev/null
+++ b/lib/gpu/lal_born.cu
@@ -0,0 +1,201 @@
+// **************************************************************************
+//                                   born.cu
+//                             -------------------
+//                            Trung Dac Nguyen (ORNL)
+//
+//  Device code for acceleration of the born pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : nguyentd@ornl.gov
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
+#endif
+#else
+#define pos_tex x_
+#endif
+
+__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
+                     __global numtyp4* coeff2,
+                     __global numtyp2 *cutsq_sigma,
+                     const int lj_types, __global numtyp *sp_lj_in,
+                     __global int *dev_nbor, __global int *dev_packed,
+                     __global acctyp4 *ans, __global acctyp *engv,
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<cutsq_sigma[mtype].x) {
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+                        + coeff1[mtype].w*r2inv*r6inv)*r2inv*factor_lj;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+            + coeff2[mtype].z*r2inv*r6inv;
+          energy+=factor_lj*(e-coeff2[mtype].w);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                  ans,engv);
+  } // if ii
+}
+
+__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
+                          __global numtyp4* coeff2_in,
+                          __global numtyp2 *cutsq_sigma,
+                          __global numtyp* sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
+                          __global acctyp4 *ans, __global acctyp *engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[4];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    coeff1[tid]=coeff1_in[tid];
+    if (eflag>0)
+      coeff2[tid]=coeff2_in[tid];
+  }
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int iw=ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutsq_sigma[mtype].x) {
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+                        + coeff1[mtype].w*r2inv*r6inv)*r2inv*factor_lj;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+            + coeff2[mtype].z*r2inv*r6inv;
+          energy+=factor_lj*(e-coeff2[mtype].w);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                  ans,engv);
+  } // if ii
+}
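As a plain-C++ cross-check of what the kernels evaluate: with the usual LAMMPS definitions born1=A/rho, born2=6C and born3=8D, the pair energy and the scalar force factor reduce to the following (sketch; not library code):

```cpp
#include <math.h>
// E(r)   = A*exp((sigma-r)/rho) - C/r^6 + D/r^8 - offset
// f(r)/r = (born1*r*exp((sigma-r)/rho) - born2/r^6 + born3/r^8) / r^2
double born_pair(double rsq, double rhoinv, double born1, double born2,
                 double born3, double A, double C, double D,
                 double sigma, double offset, double *energy) {
  double r2inv=1.0/rsq, r6inv=r2inv*r2inv*r2inv, r=sqrt(rsq);
  double rexp=exp((sigma-r)*rhoinv);
  *energy=A*rexp - C*r6inv + D*r2inv*r6inv - offset;
  return (born1*r*rexp - born2*r6inv + born3*r2inv*r6inv)*r2inv;
}
```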
diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h
new file mode 100644
index 0000000000..fe9c2a1819
--- /dev/null
+++ b/lib/gpu/lal_born.h
@@ -0,0 +1,84 @@
+/***************************************************************************
+                                    born.h
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BORN_H
+#define LAL_BORN_H
+
+#include "lal_base_atomic.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class Born : public BaseAtomic<numtyp, acctyp> {
+ public:
+  Born();
+  ~Born();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_rhoinv, double **host_born1, double **host_born2,
+           double **host_born3, double **host_a, double **host_c,
+           double **host_d, double **host_sigma,
+           double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
+  /// coeff1.w = born3
+  UCL_D_Vec<numtyp4> coeff1;
+  /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
+  UCL_D_Vec<numtyp4> coeff2;
+  /// cutsq_sigma.x = cutsq, cutsq_sigma.y = sigma
+  UCL_D_Vec<numtyp2> cutsq_sigma;
+  /// Special LJ values
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp
new file mode 100644
index 0000000000..157923d4fa
--- /dev/null
+++ b/lib/gpu/lal_born_coul_long.cpp
@@ -0,0 +1,175 @@
+/***************************************************************************
+                              born_coul_long.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born/coul/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *born_coul_long=0;
+#else
+#include "born_coul_long_cubin.h"
+#endif
+
+#include "lal_born_coul_long.h"
+#include <math.h>
+using namespace LAMMPS_AL;
+#define BornCoulLongT BornCoulLong<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
+                                _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+BornCoulLongT::~BornCoulLong() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BornCoulLongT::init(const int ntypes, double **host_cutsq,
+                        double **host_rhoinv, double **host_born1,
+                        double **host_born2, double **host_born3,
+                        double **host_a, double **host_c, double **host_d,
+                        double **host_sigma, double **host_offset,
+                        double *host_special_lj, const int nlocal,
+                        const int nall, const int max_nbors,
+                        const int maxspecial, const double cell_size,
+                        const double gpu_split, FILE *_screen,
+                        double **host_cut_ljsq, const double host_cut_coulsq,
+                        double *host_special_coul, const double qqrd2e,
+                        const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,born_coul_long,"k_born_long");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+
+  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
+                         host_d,host_offset);
+
+  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
+                         host_cut_ljsq,host_sigma);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_coulsq=host_cut_coulsq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+
+  _allocated=true;
+  this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+    +cutsq_sigma.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void BornCoulLongT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  coeff1.clear();
+  coeff2.clear();
+  cutsq_sigma.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double BornCoulLongT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(BornCoulLong<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
+                          &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->ans->force,
+                          &this->ans->engv, &eflag, &vflag,
+                          &ainum, &nbor_pitch, &this->atom->q,
+                          &cutsq_sigma, &_cut_coulsq, &_qqrd2e,
+                          &_g_ewald, &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
+                     &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q,
+                     &cutsq_sigma, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class BornCoulLong<PRECISION,ACC_PRECISION>;
(vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, + __global numtyp4* coeff2_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_born_coul_long.h b/lib/gpu/lal_born_coul_long.h new file mode 100644 index 0000000000..4dc5021f03 --- /dev/null +++ b/lib/gpu/lal_born_coul_long.h @@ -0,0 +1,88 @@ +/*************************************************************************** + born_coul_long.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/long pair style. 
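
Each kernel in this file comes in a plain and a `_fast` variant; the `_fast` version above first stages the per-type-pair coefficient tables in `__local` (shared) memory, guarded by the thread id, so the neighbor loop reads its coefficients on-chip rather than from global memory. A minimal standalone CUDA restatement of that staging pattern follows; `MAX_SHARED_TYPES` and all other names here are assumptions for illustration, not library code:

```cpp
#include <cuda_runtime.h>

#define MAX_SHARED_TYPES 8  // assumed compile-time cap on cached atom types

__global__ void k_lookup_fast(const float4 *coeff_in, const int *pair_type,
                              float *out, int n) {
  // Stage the whole type-pair table once per block, one slot per thread,
  // mirroring the tid-guarded copies in k_born_long_fast above.
  // Assumes blockDim.x is at least the table size, which the library's
  // block-size check is meant to guarantee.
  __shared__ float4 coeff[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
  if (threadIdx.x < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
    coeff[threadIdx.x] = coeff_in[threadIdx.x];
  __syncthreads();  // no lookups until the whole table is resident

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = coeff[pair_type[i]].x;  // on-chip read in the hot path
}
```
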
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BORN_COUL_LONG_H +#define LAL_BORN_COUL_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template +class BornCoulLong : public BaseCharge { + public: + BornCoulLong(); + ~BornCoulLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.w = born3 + UCL_D_Vec coeff1; + /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset + UCL_D_Vec coeff2; + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.z = sigma + UCL_D_Vec cutsq_sigma; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp new file mode 100644 index 0000000000..e8ac4eff0b --- /dev/null +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -0,0 +1,132 @@ +/*************************************************************************** + born_coul_long_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to born/coul/long acceleration routines. 
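
The coeff1/coeff2 layout documented in the header above flattens four per-type host tables into one vec4 per (i,j) type pair, so each kernel lookup is a single vectorized load. A hedged sketch of that flattening (the real work is done by the library's type_pack4 routine; the helper below and its indexing are illustrative assumptions):

```cpp
#include <vector>

struct vec4 { float x, y, z, w; };  // stand-in for the device-side numtyp4

// Entry (i,j) of four host tables becomes one vec4 at flat[i*lj_types+j],
// matching "coeff1.x = rhoinv, .y = born1, .z = born2, .w = born3" above.
// Assumes i and j already fit the padded lj_types range the library allocates.
std::vector<vec4> pack4(int lj_types, double **rhoinv, double **born1,
                        double **born2, double **born3) {
  std::vector<vec4> flat(lj_types * lj_types);
  for (int i = 0; i < lj_types; i++)
    for (int j = 0; j < lj_types; j++)
      flat[i * lj_types + j] = {(float)rhoinv[i][j], (float)born1[i][j],
                                (float)born2[i][j], (float)born3[i][j]};
  return flat;
}
```
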
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born_coul_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static BornCoulLong BORNCLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + BORNCLMF.clear(); + gpu_mode=BORNCLMF.device->gpu_mode(); + double gpu_split=BORNCLMF.device->particle_split(); + int first_gpu=BORNCLMF.device->first_device(); + int last_gpu=BORNCLMF.device->last_device(); + int world_me=BORNCLMF.device->world_me(); + int gpu_rank=BORNCLMF.device->gpu_rank(); + int procs_per_gpu=BORNCLMF.device->procs_per_gpu(); + + BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu); + + bool message=false; + if (BORNCLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald); + + BORNCLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNCLMF.estimate_gpu_overhead(); + return init_ok; +} + +void borncl_gpu_clear() { + BORNCLMF.clear(); +} + +int** borncl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void borncl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + 
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double borncl_gpu_bytes() { + return BORNCLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp new file mode 100644 index 0000000000..936ce7095b --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -0,0 +1,176 @@ +/*************************************************************************** + born_coul_wolf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/wolf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "born_coul_wolf_cl.h" +#elif defined(USE_CUDART) +const char *born_coul_wolf=0; +#else +#include "born_coul_wolf_cubin.h" +#endif + +#include "lal_born_coul_wolf.h" +#include +using namespace LAMMPS_AL; +#define BornCoulWolfT BornCoulWolf + +extern Device device; + +template +BornCoulWolfT::BornCoulWolf() : BaseCharge(), + _allocated(false) { +} + +template +BornCoulWolfT::~BornCoulWolfT() { + clear(); +} + +template +int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double alf, const double e_shift, const double f_shift) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,born_coul_wolf,"k_born_wolf"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv, + host_born1,host_born2,host_born3); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, + host_d,host_offset); + + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, + host_cut_ljsq,host_sigma); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _alf=alf; + _e_shift=e_shift; + _f_shift=f_shift; + + 
_allocated=true; + this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes() + +cutsq_sigma.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void BornCoulWolfT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff1.clear(); + coeff2.clear(); + cutsq_sigma.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double BornCoulWolfT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(BornCoulWolf); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, + &cutsq_sigma, &_cut_coulsq, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class BornCoulWolf; diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu new file mode 100644 index 0000000000..7461569db5 --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -0,0 +1,282 @@ +// ************************************************************************** +// born_coul_wolf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the born/coul/wolf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, + __global numtyp4* coeff2, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp alf, const numtyp e_shift, + const numtyp f_shift, const int t_per_atom) { + int tid, ii, offset; + 
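
Every loop() in this patch sizes its launch the same way: BX threads per block, t_per_atom threads cooperating on each atom, so a block covers BX/t_per_atom atoms and the grid is rounded up to cover all inum of them. A worked instance of that arithmetic (the values are examples only):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  int inum = 10000;   // atoms assigned to this device (example value)
  int BX = 128;       // block size, as returned by block_size()
  int t_per_atom = 4; // cooperating threads per atom
  int atoms_per_block = BX / t_per_atom;                      // 32
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 atoms_per_block));           // 313 blocks
  printf("launch %d blocks x %d threads\n", GX, BX);
  return 0;
}
```
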
atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + + for ( ; nbor0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(v_sh-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, + __global numtyp4* coeff2_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp alf, const numtyp e_shift, + const numtyp f_shift, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + + for ( ; nbor0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(v_sh-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h new file mode 100644 index 0000000000..9e02d23233 --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.h @@ -0,0 +1,89 @@ +/*************************************************************************** + born_coul_wolf.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/wolf pair style. 
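
Unlike the Ewald-based long-range kernels, the Wolf kernels above add a per-atom self-energy before the neighbor loop. The division by t_per_atom in the kernel only compensates for each of the cooperating threads adding the same term ahead of the reduction, and the factor of two on e_self appears to offset the halving applied when pairwise energies are stored; both are omitted in this plain restatement (a sketch, not library code):

```cpp
// E_self(i) = -(e_shift/2 + alpha/sqrt(pi)) * q_i^2 * qqrd2e,
// accumulated once per atom i in k_born_wolf/k_born_wolf_fast above.
double wolf_self_energy(double qi, double qqrd2e, double alpha,
                        double e_shift) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi), as the kernels define
  return -(0.5 * e_shift + alpha / MY_PIS) * qi * qi * qqrd2e;
}
```
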
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BORN_COUL_WOLF_H
+#define LAL_BORN_COUL_WOLF_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
+ public:
+  BornCoulWolf();
+  ~BornCoulWolf();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+           double **host_born1, double **host_born2, double **host_born3,
+           double **host_a, double **host_c, double **host_d,
+           double **host_sigma, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double alf, const double e_shift,
+           const double f_shift);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
+  /// coeff1.w = born3
+  UCL_D_Vec<numtyp4> coeff1;
+  /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
+  UCL_D_Vec<numtyp4> coeff2;
+  /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
+  /// cutsq_sigma.z = sigma
+  UCL_D_Vec<numtyp4> cutsq_sigma;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp
new file mode 100644
index 0000000000..3e779d099e
--- /dev/null
+++ b/lib/gpu/lal_born_coul_wolf_ext.cpp
@@ -0,0 +1,134 @@
+/***************************************************************************
+                            born_coul_wolf_ext.cpp
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to born/coul/wolf acceleration routines.
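
As the sp_lj comment above notes, the four special-bond LJ factors and four Coulomb factors travel to the device in one eight-element vector. The kernels recover them from the high bits of each packed neighbor index; in the long-range kernels the stored Coulomb quantity is effectively 1 - special, so the correction can be subtracted inside the Ewald term, as the `_erfc-factor_coul` expression earlier suggests. A sketch of the unpacking convention (the SBBITS/NEIGHMASK values are assumptions matching common LAMMPS builds):

```cpp
#define SBBITS 30            // assumed: top two bits carry the 1-2/1-3/1-4 flag
#define NEIGHMASK 0x3FFFFFFF // assumed: low 30 bits hold the neighbor index

inline int sbmask(int j) { return j >> SBBITS & 3; }

// factor_lj scales the Born/LJ part, factor_coul the Coulomb part.
void unpack_special(int packed_j, const double sp_lj[8], double &factor_lj,
                    double &factor_coul, int &j) {
  factor_lj   = sp_lj[sbmask(packed_j)];
  factor_coul = sp_lj[sbmask(packed_j) + 4];
  j = packed_j & NEIGHMASK;
}
```
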
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born_coul_wolf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static BornCoulWolf BORNCWMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double alf, const double e_shift, const double f_shift) { + BORNCWMF.clear(); + gpu_mode=BORNCWMF.device->gpu_mode(); + double gpu_split=BORNCWMF.device->particle_split(); + int first_gpu=BORNCWMF.device->first_device(); + int last_gpu=BORNCWMF.device->last_device(); + int world_me=BORNCWMF.device->world_me(); + int gpu_rank=BORNCWMF.device->gpu_rank(); + int procs_per_gpu=BORNCWMF.device->procs_per_gpu(); + + BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu); + + bool message=false; + if (BORNCWMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, + alf, e_shift, f_shift); + + BORNCWMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNCWMF.estimate_gpu_overhead(); + return init_ok; +} + +void borncw_gpu_clear() { + BORNCWMF.clear(); +} + +int** borncw_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void borncw_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + 
BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double borncw_gpu_bytes() { + return BORNCWMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp new file mode 100644 index 0000000000..7785353a8a --- /dev/null +++ b/lib/gpu/lal_born_ext.cpp @@ -0,0 +1,124 @@ +/*************************************************************************** + born_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to born acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Born BORNMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + BORNMF.clear(); + gpu_mode=BORNMF.device->gpu_mode(); + double gpu_split=BORNMF.device->particle_split(); + int first_gpu=BORNMF.device->first_device(); + int last_gpu=BORNMF.device->last_device(); + int world_me=BORNMF.device->world_me(); + int gpu_rank=BORNMF.device->gpu_rank(); + int procs_per_gpu=BORNMF.device->procs_per_gpu(); + + BORNMF.device->init_message(screen,"born",first_gpu,last_gpu); + + bool message=false; + if (BORNMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + BORNMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNMF.estimate_gpu_overhead(); + return init_ok; +} + +void born_gpu_clear() { + BORNMF.clear(); +} + +int ** born_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void born_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int 
**firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double born_gpu_bytes() { + return BORNMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp new file mode 100644 index 0000000000..cb6d23fc31 --- /dev/null +++ b/lib/gpu/lal_colloid.cpp @@ -0,0 +1,181 @@ +/*************************************************************************** + colloid.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the colloid pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "colloid_cl.h" +#elif defined(USE_CUDART) +const char *colloid=0; +#else +#include "colloid_cubin.h" +#endif + +#include "lal_colloid.h" +#include +using namespace LAMMPS_AL; +#define ColloidT Colloid + +extern Device device; + +template +ColloidT::Colloid() : BaseAtomic(), _allocated(false) { +} + +template +ColloidT::~Colloid() { + clear(); +} + +template +int ColloidT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, + double **host_sigma3, double **host_sigma6, + int **host_form, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,colloid,"k_colloid"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + colloid1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,colloid1,host_write,host_a12,host_a1, + host_a2); + + colloid2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,colloid2,host_write,host_d1,host_d2, + host_sigma3,host_sigma6); + + UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + for (int i=0; iucl_device),UCL_READ_ONLY); + for (int i=0; i dview; + 
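
The integer `form` table staged just above selects, per type pair, which branch of the colloid kernels in the device file that follows is taken. The values line up with the three interaction cases of the CPU colloid pair style; the enum below restates that mapping as a sketch, with names inferred rather than taken from this patch:

```cpp
// Mapping inferred from the "form[mtype]==0/1/2" branches in k_colloid:
enum ColloidForm {
  SMALL_SMALL = 0,  // both particles point-like: plain 12-6 LJ term
  SMALL_LARGE = 1,  // point particle vs. finite sphere: fR-based branch
  LARGE_LARGE = 2   // two finite spheres: Hamaker-integrated branch
};
```
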
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes() + +colloid1.row_bytes()+colloid2.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void ColloidT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + colloid1.clear(); + colloid2.clear(); + form.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double ColloidT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Colloid); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void ColloidT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class Colloid; diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu new file mode 100644 index 0000000000..74e249083d --- /dev/null +++ b/lib/gpu/lal_colloid.cu @@ -0,0 +1,329 @@ +// ************************************************************************** +// colloid.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the colloid pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +#else +texture pos_tex; +#endif +#else +#define pos_tex x_ +#endif + +__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, + __global numtyp4* colloid1, + __global numtyp4* colloid2, + __global int *form, + __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; 
f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=(numtyp)0.0; + if (form[mtype]==0) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + } else if (form[mtype]==1) { + e=(numtyp)2.0/(numtyp)9.0*fR * + ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) + + (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]); + } else if (form[mtype]==2) { + e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); + } + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_colloid_fast(__global numtyp4 *x_, + __global numtyp4 *lj1_in, + __global numtyp4 *lj3_in, + __global numtyp *sp_lj_in, + __global numtyp4 *colloid1_in, + __global numtyp4 *colloid2_in, + __global int *form_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + numtyp e=(numtyp)0.0; + if (form[mtype]==0) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + } else if (form[mtype]==1) { + e=(numtyp)2.0/(numtyp)9.0*fR * + ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])* + colloid2[mtype].w/K[6]); + } else if (form[mtype]==2) { + e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); + } + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h new file mode 100644 index 0000000000..416beabcdf --- /dev/null +++ b/lib/gpu/lal_colloid.h @@ -0,0 +1,89 @@ +/*************************************************************************** + colloid.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the colloid pair style. 
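
In every energy branch above, the tabulated offset (`lj3.z` here, `coeff2.w` in the Born kernels) is subtracted so the pair energy vanishes at the cutoff. For the plain 12-6 branch that offset is simply the potential evaluated at r = rc; a one-function sketch, not library code:

```cpp
#include <cmath>

// U_LJ(rc): subtracting this from U_LJ(r) makes the shifted energy zero
// at the cutoff, matching "energy += factor_lj*(e - offset)" in the kernels.
double lj_offset(double epsilon, double sigma, double rc) {
  double sr6 = std::pow(sigma / rc, 6.0);
  return 4.0 * epsilon * (sr6 * sr6 - sr6);
}
```
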
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_COLLOID_H +#define LAL_COLLOID_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template +class Colloid : public BaseAtomic { + public: + Colloid(); + ~Colloid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 + UCL_D_Vec colloid1; + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.w = sigma6 + UCL_D_Vec colloid2; + /// form + UCL_D_Vec form; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp new file mode 100644 index 0000000000..d4e8a2092b --- /dev/null +++ b/lib/gpu/lal_colloid_ext.cpp @@ -0,0 +1,127 @@ +/*************************************************************************** + colloid_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to colloid acceleration routines. 
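
All of the `*_gpu_init()` wrappers in this patch (born, born/coul/long, born/coul/wolf, and the colloid one that follows) share the same staged start-up: process 0 initializes first so kernel compilation happens once, then the remaining ranks take their turn one GPU rank at a time. A condensed sketch of that control flow; the helper names are stand-ins for the library's device methods, not real API:

```cpp
int do_init();        // stands in for STYLE.init(...); returns 0 on success
void world_barrier(); // stands in for device->world_barrier()
void gpu_barrier();   // stands in for device->gpu_barrier()

int staged_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok = 0;
  if (world_me == 0)          // rank 0 compiles/initializes first
    init_ok = do_init();
  world_barrier();            // everyone waits for the cached build
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();    // then one rank per GPU at a time
    gpu_barrier();
  }
  return init_ok;             // negative codes map to the init() docs above
}
```
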
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_colloid.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Colloid COLLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + COLLMF.clear(); + gpu_mode=COLLMF.device->gpu_mode(); + double gpu_split=COLLMF.device->particle_split(); + int first_gpu=COLLMF.device->first_device(); + int last_gpu=COLLMF.device->last_device(); + int world_me=COLLMF.device->world_me(); + int gpu_rank=COLLMF.device->gpu_rank(); + int procs_per_gpu=COLLMF.device->procs_per_gpu(); + + COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu); + + bool message=false; + if (COLLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, host_a12, host_a1, + host_a2, host_d1, host_d2, host_sigma3, + host_sigma6, host_form, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + COLLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + COLLMF.estimate_gpu_overhead(); + return init_ok; +} + +void colloid_gpu_clear() { + COLLMF.clear(); +} + +int ** colloid_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void colloid_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double colloid_gpu_bytes() { + return COLLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp new file mode 100644 index 0000000000..e597a325ff --- 
/dev/null +++ b/lib/gpu/lal_coul_dsf.cpp @@ -0,0 +1,153 @@ +/*************************************************************************** + coul_dsf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the coul/dsf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 8/15/2012 + email : nguyentdw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "coul_dsf_cl.h" +#elif defined(USE_CUDART) +const char *coul_dsf=0; +#else +#include "coul_dsf_cubin.h" +#endif + +#include "lal_coul_dsf.h" +#include +using namespace LAMMPS_AL; +#define CoulDSFT CoulDSF + +extern Device device; + +template +CoulDSFT::CoulDSF() : BaseCharge(), + _allocated(false) { +} + +template +CoulDSFT::~CoulDSF() { + clear(); +} + +template +int CoulDSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,coul_dsf,"k_coul_dsf"); + if (success!=0) + return success; + + _cut_coulsq=host_cut_coulsq; + _e_shift=e_shift; + _f_shift=f_shift; + _alpha=alpha; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,4,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=sp_lj.row_bytes(); + return 0; +} + +template +void CoulDSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CoulDSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CoulDSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void CoulDSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, 
&this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CoulDSF; diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu new file mode 100644 index 0000000000..3ce1eccab3 --- /dev/null +++ b/lib/gpu/lal_coul_dsf.cu @@ -0,0 +1,214 @@ +// ************************************************************************** +// coul_dsf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the coul/dsf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : 8/15/2012 +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + 
virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+      if (eflag>0) {
+        if (rsq < cut_coulsq) {
+          numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
+          e_coul += e;
+        }
+      }
+      if (vflag>0) {
+        virial[0] += delx*delx*force;
+        virial[1] += dely*dely*force;
+        virial[2] += delz*delz*force;
+        virial[3] += delx*dely*force;
+        virial[4] += delx*delz*force;
+        virial[5] += dely*delz*force;
+      }
+    }
+
+  } // for nbor
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
+  } // if ii
+}
+
diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h
new file mode 100644
index 0000000000..0579ab46d5
--- /dev/null
+++ b/lib/gpu/lal_coul_dsf.h
@@ -0,0 +1,78 @@
+/***************************************************************************
+                                 coul_dsf.h
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the coul/dsf pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : 8/15/2012
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_COUL_DSF_H
+#define LAL_COUL_DSF_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class CoulDSF : public BaseCharge<numtyp, acctyp> {
+ public:
+  CoulDSF();
+  ~CoulDSF();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const int nlocal, const int nall,
+           const int max_nbors, const int maxspecial,
+           const double cell_size, const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double e_shift, const double f_shift,
+           const double alpha);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// Special Coul values [0-3]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e;
+
+ private:
+  bool _allocated;
+  numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp
new file mode 100644
index 0000000000..19879cb9c5
--- /dev/null
+++ b/lib/gpu/lal_coul_dsf_ext.cpp
@@ -0,0 +1,125 @@
+/***************************************************************************
+                               coul_dsf_ext.cpp
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to coul/dsf acceleration routines.
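
The `e_shift` and `f_shift` constants consumed by the coul/dsf kernels above are precomputed on the host from the damping parameter alpha and the Coulomb cutoff, following the damped-shifted-force form of Fennell and Gezelter. A sketch of that precomputation, mirroring the CPU pair style rather than code in this patch:

```cpp
#include <cmath>

// Damped-shifted-force constants: f_shift matches the damped force at the
// cutoff and e_shift the corresponding energy, so both the pair energy
// prefactor*(erfcc - r*e_shift - rsq*f_shift) and the force go smoothly
// to zero at r = rc.
void dsf_shifts(double alpha, double rc, double &e_shift, double &f_shift) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi)
  double erfcc = std::erfc(alpha * rc);
  double erfcd = std::exp(-alpha * alpha * rc * rc);
  f_shift = -(erfcc / (rc * rc) + 2.0 * alpha / MY_PIS * erfcd / rc);
  e_shift = erfcc / rc - f_shift * rc;
}
```
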
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 8/15/2012 + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_coul_dsf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CoulDSF CDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha) { + CDMF.clear(); + gpu_mode=CDMF.device->gpu_mode(); + double gpu_split=CDMF.device->particle_split(); + int first_gpu=CDMF.device->first_device(); + int last_gpu=CDMF.device->last_device(); + int world_me=CDMF.device->world_me(); + int gpu_rank=CDMF.device->gpu_rank(); + int procs_per_gpu=CDMF.device->procs_per_gpu(); + + CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu); + + bool message=false; + if (CDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, + qqrd2e, e_shift, f_shift, alpha); + + CDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CDMF.estimate_gpu_overhead(); + return init_ok; +} + +void cdsf_gpu_clear() { + CDMF.clear(); +} + +int** cdsf_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double cdsf_gpu_bytes() { + return CDMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp new file mode 100644 index 0000000000..41ec84648d --- /dev/null +++ b/lib/gpu/lal_dipole_lj.cpp @@ -0,0 +1,170 @@ 
+/*************************************************************************** + dipole_lj.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "dipole_lj_cl.h" +#elif defined(USE_CUDART) +const char *dipole_lj=0; +#else +#include "dipole_lj_cubin.h" +#endif + +#include "lal_dipole_lj.h" +#include +using namespace LAMMPS_AL; +#define DipoleLJT DipoleLJ + +extern Device device; + +template +DipoleLJT::DipoleLJ() : BaseDipole(), + _allocated(false) { +} + +template +DipoleLJT::~DipoleLJ() { + clear(); +} + +template +int DipoleLJT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int DipoleLJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,dipole_lj,"k_dipole_lj"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void DipoleLJT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double DipoleLJT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(DipoleLJ); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void DipoleLJT::loop(const bool _eflag, const bool _vflag) { + // Compute the 
block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class DipoleLJ; diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu new file mode 100644 index 0000000000..6ab0b3037d --- /dev/null +++ b/lib/gpu/lal_dipole_lj.cu @@ -0,0 +1,501 @@ +// ************************************************************************** +// dipole_lj.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the dipole/cut pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" + +#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[8][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + red_acc[7][tid]=ecoul; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<8; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + ecoul=red_acc[7][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + 
__global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp4 *mu_, + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + r7inv = r5inv*r2inv; + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; + pre2 = (numtyp)3.0*r5inv*pjdotr; + pre3 = (numtyp)3.0*r5inv*pidotr; + pre4 = (numtyp)(-1.0)*r3inv; + + forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; + forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; + forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr; + pre2 = qj*r3inv; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; + pre2 = qtmp*r3inv; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + e = qtmp*qj*rinv; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + 
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *mu_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + r7inv = r5inv*r2inv; + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; + pre2 = (numtyp)3.0*r5inv*pjdotr; + pre3 = (numtyp)3.0*r5inv*pidotr; + pre4 = (numtyp)(-1.0)*r3inv; + + forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; + forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; + forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr; + pre2 = qj*r3inv; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; + pre2 = qtmp*r3inv; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + 
delz*force_lj; + + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0; + if (rsq < lj1[mtype].w) { + e = qtmp*qj*rinv; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr; + e *= fq; + } else e = (acctyp)0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h new file mode 100644 index 0000000000..b08b7a8669 --- /dev/null +++ b/lib/gpu/lal_dipole_lj.h @@ -0,0 +1,85 @@ +/*************************************************************************** + dipole_lj.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_DIPOLE_LJ_H +#define LAL_DIPOLE_LJ_H + +#include "lal_base_dipole.h" + +namespace LAMMPS_AL { + +template +class DipoleLJ : public BaseDipole { + public: + DipoleLJ(); + ~DipoleLJ(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int 
_lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp new file mode 100644 index 0000000000..05d8fd9f72 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + dipole_lj_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to dipole/cut acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_dipole_lj.h" + +using namespace std; +using namespace LAMMPS_AL; + +static DipoleLJ DPLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + DPLMF.clear(); + gpu_mode=DPLMF.device->gpu_mode(); + double gpu_split=DPLMF.device->particle_split(); + int first_gpu=DPLMF.device->first_device(); + int last_gpu=DPLMF.device->last_device(); + int world_me=DPLMF.device->world_me(); + int gpu_rank=DPLMF.device->gpu_rank(); + int procs_per_gpu=DPLMF.device->procs_per_gpu(); + + DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu); + + bool message=false; + if (DPLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + DPLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + DPLMF.estimate_gpu_overhead(); + return init_ok; +} + +void dpl_gpu_clear() { + DPLMF.clear(); +} + +int** dpl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double **host_mu, + double *boxlo, double *prd) { + return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, host_mu, boxlo, prd); +} + +void dpl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, 
int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double *boxlo, double *prd) { + DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, + nlocal,boxlo,prd); +} + +double dpl_gpu_bytes() { + return DPLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp new file mode 100644 index 0000000000..4e962dc8c7 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -0,0 +1,170 @@ +/*************************************************************************** + dipole_lj_sf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/sf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "dipole_lj_sf_cl.h" +#elif defined(USE_CUDART) +const char *dipole_lj_sf=0; +#else +#include "dipole_lj_sf_cubin.h" +#endif + +#include "lal_dipole_lj_sf.h" +#include +using namespace LAMMPS_AL; +#define DipoleLJSFT DipoleLJSF + +extern Device device; + +template +DipoleLJSFT::DipoleLJSF() : BaseDipole(), + _allocated(false) { +} + +template +DipoleLJSFT::~DipoleLJSF() { + clear(); +} + +template +int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int DipoleLJSFT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,dipole_lj_sf,"k_dipole_lj_sf"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_cutsq); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + 
_allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void DipoleLJSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double DipoleLJSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(DipoleLJSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class DipoleLJSF; diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu new file mode 100644 index 0000000000..16d5463cb1 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -0,0 +1,562 @@ +// ************************************************************************** +// dipole_lj_sf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the dipole/sf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" + +#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[8][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + red_acc[7][tid]=ecoul; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<8; r++) 
\ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + ecoul=red_acc[7][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp4 *mu_, + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; + pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); + aforcecoul.x = pre1*delx; + aforcecoul.y = pre1*dely; + aforcecoul.z = pre1*delz; + + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ + (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; + presf = (numtyp)2.0*r2inv*pidotr*pjdotr; + bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); + bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); + bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); + + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); + forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); + forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); + + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; + pre4 = -bfac*r3inv; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + rcutcoul2inv=ucl_recip(lj1[mtype].w); + pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qj*r3inv * pqfac; + + forcecoul.x += pre2*mui.x - pre1*delx; + 
forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + rcutcoul2inv=ucl_recip(lj1[mtype].w); + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qtmp*r3inv * qpfac; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); + e = qtmp*qj*rinv*fac*fac; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr * pqfac; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr * qpfac; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + (numtyp)4.0*lj3[mtype].y); + energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *mu_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + 
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; + pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); + aforcecoul.x = pre1*delx; + aforcecoul.y = pre1*dely; + aforcecoul.z = pre1*delz; + + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ + (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; + presf = (numtyp)2.0*r2inv*pidotr*pjdotr; + bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); + bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); + bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); + + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); + forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); + forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); + + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; + pre4 = -bfac*r3inv; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qj*r3inv * pqfac; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qtmp*r3inv * qpfac; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); + e = qtmp*qj*rinv*fac*fac; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr * pqfac; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr * qpfac; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + (numtyp)4.0*lj3[mtype].y); 
+ energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h new file mode 100644 index 0000000000..83cea4c2a4 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -0,0 +1,85 @@ +/*************************************************************************** + dipole_lj_sf.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/sf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_DIPOLE_LJ_SF_H +#define LAL_DIPOLE_LJ_SF_H + +#include "lal_base_dipole.h" + +namespace LAMMPS_AL { + +template +class DipoleLJSF : public BaseDipole { + public: + DipoleLJSF(); + ~DipoleLJSF(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = cutsq + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp new file mode 100644 index 0000000000..53ef66fca6 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + dipole_lj_sf_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to dipole/sf acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_dipole_lj_sf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static DipoleLJSF DPLSFMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + DPLSFMF.clear(); + gpu_mode=DPLSFMF.device->gpu_mode(); + double gpu_split=DPLSFMF.device->particle_split(); + int first_gpu=DPLSFMF.device->first_device(); + int last_gpu=DPLSFMF.device->last_device(); + int world_me=DPLSFMF.device->world_me(); + int gpu_rank=DPLSFMF.device->gpu_rank(); + int procs_per_gpu=DPLSFMF.device->procs_per_gpu(); + + DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu); + + bool message=false; + if (DPLSFMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + DPLSFMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + DPLSFMF.estimate_gpu_overhead(); + return init_ok; +} + +void dplsf_gpu_clear() { + DPLSFMF.clear(); +} + +int** dplsf_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double **host_mu, + double *boxlo, double *prd) { + return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, host_mu, boxlo, prd); +} + +void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double *boxlo, double *prd) { + DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, + nlocal,boxlo,prd); +} + +double dplsf_gpu_bytes() { + return DPLSFMF.host_memory_usage(); +} + + 
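The dipole kernels above map directly onto the textbook point-dipole interaction. A compact host-side transcription of the dipole-dipole branch of k_dipole_lj, with the same pre1..pre4 factors (the fq = factor_coul*qqrd2e scaling is applied by the caller, as in the kernel; the vec3 type and function name are illustrative):

#include <cmath>

struct vec3 { double x, y, z; };

static vec3 cross(const vec3 &a, const vec3 &b) {
  return {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x};
}

// Force on atom i and its torque contribution from one dipole-dipole pair;
// del = r_i - r_j, mui/muj are the point dipoles (unshifted dipole/cut form).
void dipole_dipole(const vec3 &del, const vec3 &mui, const vec3 &muj,
                   vec3 &force, vec3 &ti) {
  const double rsq = del.x*del.x + del.y*del.y + del.z*del.z;
  const double r2inv = 1.0/rsq, rinv = std::sqrt(r2inv);
  const double r3inv = r2inv*rinv, r5inv = r3inv*r2inv, r7inv = r5inv*r2inv;
  const double pdotp  = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
  const double pidotr = mui.x*del.x + mui.y*del.y + mui.z*del.z;
  const double pjdotr = muj.x*del.x + muj.y*del.y + muj.z*del.z;
  const double pre1 = 3.0*r5inv*pdotp - 15.0*r7inv*pidotr*pjdotr;
  const double pre2 = 3.0*r5inv*pjdotr;
  const double pre3 = 3.0*r5inv*pidotr;
  const double pre4 = -r3inv;
  force = {pre1*del.x + pre2*mui.x + pre3*muj.x,
           pre1*del.y + pre2*mui.y + pre3*muj.y,
           pre1*del.z + pre2*mui.z + pre3*muj.z};
  const vec3 c1 = cross(mui, muj), c2 = cross(mui, del);
  ti = {pre4*c1.x + pre2*c2.x, pre4*c1.y + pre2*c2.y, pre4*c1.z + pre2*c2.z};
}

The dipole/sf variant wraps the same dot products in the afac/bfac shifted-force factors so that forces, torques, and energies all go smoothly to zero at the Coulomb cutoff.

Likewise, the Lennard-Jones part of dipole/sf uses the Stoddard-Ford shift rather than a plain truncation, and the energy expression in k_dipole_lj_sf can be checked in isolation. A minimal self-contained sketch (lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6, as packed by init(); names illustrative):

#include <cmath>
#include <cstdio>

double ljsf_energy(double rsq, double lj3, double lj4, double cutsq) {
  const double r6inv = 1.0/(rsq*rsq*rsq);
  const double rc2inv = 1.0/cutsq;
  const double rc6inv = rc2inv*rc2inv*rc2inv;
  return r6inv*(lj3*r6inv - lj4)
       + rc6inv*(6.0*lj3*rc6inv - 3.0*lj4)*rsq*rc2inv
       + rc6inv*(-7.0*lj3*rc6inv + 4.0*lj4);
}

int main() {
  const double eps = 1.0, sigma = 1.0, rc = 2.5;
  const double lj3 = 4.0*eps*std::pow(sigma, 12);
  const double lj4 = 4.0*eps*std::pow(sigma, 6);
  // Both the energy and its slope vanish at the cutoff by construction:
  std::printf("E(rc) = %g\n", ljsf_energy(rc*rc, lj3, lj4, rc*rc));  // ~0
  return 0;
}

Finally, the store_answers_tq macro folds the per-thread partial forces, torques, energies, and virials with a strided reduction: the t_per_atom threads assigned to one atom halve the stride each step,

for (unsigned int s = t_per_atom/2; s > 0; s >>= 1) {
  if (offset < s)
    for (int r = 0; r < 6; r++)
      red_acc[r][tid] += red_acc[r][tid + s];
}

and the macro issues no barrier between steps, which relies on the t_per_atom threads for an atom falling inside a single warp/wavefront.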
diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp new file mode 100644 index 0000000000..d05573f751 --- /dev/null +++ b/lib/gpu/lal_gauss.cpp @@ -0,0 +1,146 @@ +/*************************************************************************** + gauss.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the gauss pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "gauss_cl.h" +#elif defined(USE_CUDART) +const char *gauss=0; +#else +#include "gauss_cubin.h" +#endif + +#include "lal_gauss.h" +#include +using namespace LAMMPS_AL; +#define GaussT Gauss + +extern Device device; + +template +GaussT::Gauss() : BaseAtomic(), _allocated(false) { +} + +template +GaussT::~Gauss() { + clear(); +} + +template +int GaussT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,gauss,"k_gauss"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b, + host_cutsq,host_offset); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=gauss1.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void GaussT::clear() { + if (!_allocated) + return; + _allocated=false; + + gauss1.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double GaussT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Gauss); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void GaussT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + 
this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class Gauss; diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu new file mode 100644 index 0000000000..b078d17c3b --- /dev/null +++ b/lib/gpu/lal_gauss.cu @@ -0,0 +1,189 @@ +// ************************************************************************** +// gauss.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the gauss pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +#else +texture pos_tex; +#endif +#else +#define pos_tex x_ +#endif + +__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1, + const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + gauss1[mtype].w); + energy+=factor_lj*e; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) { + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + gauss1[mtype].w); + energy+=factor_lj*e; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_gauss.h 
b/lib/gpu/lal_gauss.h new file mode 100644 index 0000000000..fad790899d --- /dev/null +++ b/lib/gpu/lal_gauss.h @@ -0,0 +1,77 @@ +/*************************************************************************** + gauss.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the gauss pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_GAUSS_H +#define LAL_GAUSS_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class Gauss : public BaseAtomic<numtyp, acctyp> { + public: + Gauss(); + ~Gauss(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_a, double **host_b, double **host_offset, + double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset + UCL_D_Vec<numtyp4> gauss1; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp new file mode 100644 index 0000000000..4d95c7cfb9 --- /dev/null +++ b/lib/gpu/lal_gauss_ext.cpp @@ -0,0 +1,120 @@ +/*************************************************************************** + gauss_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to gauss acceleration routines. 
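The packed gauss1 vector (gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset) fully determines the pair model E(r) = -a*exp(-b*r^2), shifted so it vanishes at the cutoff. A minimal host-side sketch, assuming gauss1.w holds the value that zeroes the kernel's energy expression at the cutoff (helper names illustrative):

#include <cmath>

struct GaussPair { double energy, fpair; };   // fpair = |F|/r

GaussPair gauss_pair(double a, double b, double rsq, double cutsq) {
  GaussPair p{0.0, 0.0};
  if (rsq < cutsq) {
    const double offset = a*std::exp(-b*cutsq);  // makes energy(cutsq) == 0
    p.energy = -(a*std::exp(-b*rsq) - offset);   // the kernel's e
    p.fpair  = -2.0*a*b*std::exp(-b*rsq);        // -dE/dr divided by r
  }
  return p;
}

For a > 0 this is a purely attractive Gaussian well; the negative fpair pulls the pair together, following the usual LAMMPS fpair convention.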
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_gauss.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Gauss GLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + GLMF.clear(); + gpu_mode=GLMF.device->gpu_mode(); + double gpu_split=GLMF.device->particle_split(); + int first_gpu=GLMF.device->first_device(); + int last_gpu=GLMF.device->last_device(); + int world_me=GLMF.device->world_me(); + int gpu_rank=GLMF.device->gpu_rank(); + int procs_per_gpu=GLMF.device->procs_per_gpu(); + + GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu); + + bool message=false; + if (GLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + GLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + GLMF.estimate_gpu_overhead(); + return init_ok; +} + +void gauss_gpu_clear() { + GLMF.clear(); +} + +int ** gauss_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void gauss_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double gauss_gpu_bytes() { + return GLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp new file mode 100644 index 0000000000..54f5fe7ce0 --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_coul_debye.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the lj/cut/coul/debye pair style. 
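The only new physics in lj/cut/coul/debye relative to lj/cut/coul/cut is the Debye screening factor exp(-kappa*r) on the Coulomb term, with kappa the inverse screening length that init() stores as _kappa below. A minimal sketch of the screened pair term (helper name illustrative; the kernel folds the same expression, times factor_coul, into fpair alongside the LJ part):

#include <cmath>

struct DebyePair { double energy, fpair; };   // fpair = |F|/r

DebyePair coul_debye(double qqrd2e, double qi, double qj,
                     double r, double kappa) {
  const double screening = std::exp(-kappa*r);
  DebyePair p;
  p.energy = qqrd2e*qi*qj*screening/r;
  // -dE/dr = qqrd2e*qi*qj*screening*(kappa + 1/r)/r; dividing by r:
  p.fpair  = qqrd2e*qi*qj*screening*(kappa + 1.0/r)/(r*r);
  return p;
}

Setting kappa = 0 recovers the bare Coulomb term, so coul/cut results make a convenient sanity check for the GPU path.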
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_coul_debye_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_debye=0; +#else +#include "lj_coul_debye_cubin.h" +#endif + +#include "lal_lj_coul_debye.h" +#include +using namespace LAMMPS_AL; +#define LJCoulDebyeT LJCoulDebye + +extern Device device; + +template +LJCoulDebyeT::LJCoulDebye() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulDebyeT::~LJCoulDebye() { + clear(); +} + +template +int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulDebyeT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_debye,"k_lj_debye"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + _kappa=kappa; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulDebyeT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulDebyeT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulDebye); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if 
(_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &_qqrd2e, &_kappa, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &_qqrd2e, &_kappa, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJCoulDebye; diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu new file mode 100644 index 0000000000..e63d7a40e3 --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -0,0 +1,256 @@ +// ************************************************************************** +// lj_coul_debye.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the lj/cut/coul/debye pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const numtyp kappa, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (rsq < lj1[mtype].w) { + e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, 
+ __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const numtyp kappa, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (rsq < lj1[mtype].w) { + e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h new file mode 100644 index 0000000000..73e38c647d --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.h @@ -0,0 +1,85 @@ +/*************************************************************************** + lj_coul_debye.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the lj/cut/coul/debye pair style. 
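For reference, the screened-Coulomb terms the two kernels above accumulate: with screening = exp(-kappa*r), the Debye pair energy is qqrd2e*qi*qj*exp(-kappa*r)/r and the force follows by differentiation. A sketch under those assumptions (debye_coul and qiqj are illustrative names):

#include <cmath>

// Debye-screened Coulomb: E = C*qi*qj*exp(-kappa*r)/r with C = qqrd2e.
// Returns the force contribution divided by r (the fpair convention).
inline double debye_coul(const double qqrd2e, const double qiqj,
                         const double kappa, const double rsq,
                         const double factor_coul, double &e_coul) {
  const double r = std::sqrt(rsq);
  const double rinv = 1.0/r;
  const double screening = std::exp(-kappa*r);
  e_coul += factor_coul*qqrd2e*qiqj*rinv*screening;   // matches the kernel line
  // -dE/dr = C*qi*qj*screening*(kappa + 1/r); divide by r for fpair:
  return factor_coul*qqrd2e*qiqj*screening*(kappa + rinv)*rinv*rinv;
}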
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_COUL_DEBYE_H +#define LAL_LJ_COUL_DEBYE_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJCoulDebye : public BaseCharge<numtyp, acctyp> { + public: + LJCoulDebye(); + ~LJCoulDebye(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double kappa); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// cutsq + UCL_D_Vec<numtyp> cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e,_kappa; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp new file mode 100644 index 0000000000..3e5aab4d9d --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + lj_coul_debye_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to lj/cut/coul/debye acceleration routines.
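The lj1/lj3 members documented above are filled by type_pack4, which flattens square per-type coefficient tables into one vec4-style array indexed by itype*lj_types + jtype. A host-side sketch of that packing (0-based here for simplicity; the library carries LAMMPS' 1-based types through, and pack4/numtyp4 are illustrative names):

#include <vector>

struct numtyp4 { double x, y, z, w; };

// Flatten four ntypes x ntypes host tables into a single padded table of
// lj_types*lj_types vec4 entries; lj_types >= ntypes, extra entries stay zero
// (padding used by the shared-memory fast kernels).
std::vector<numtyp4> pack4(const int ntypes, const int lj_types,
                           double **a, double **b, double **c, double **d) {
  std::vector<numtyp4> table(lj_types*lj_types, numtyp4{0.0, 0.0, 0.0, 0.0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      table[i*lj_types + j] = numtyp4{a[i][j], b[i][j], c[i][j], d[i][j]};
  return table;
}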
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_debye.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulDebye LJCDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa) { + LJCDMF.clear(); + gpu_mode=LJCDMF.device->gpu_mode(); + double gpu_split=LJCDMF.device->particle_split(); + int first_gpu=LJCDMF.device->first_device(); + int last_gpu=LJCDMF.device->last_device(); + int world_me=LJCDMF.device->world_me(); + int gpu_rank=LJCDMF.device->gpu_rank(); + int procs_per_gpu=LJCDMF.device->procs_per_gpu(); + + LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu); + + bool message=false; + if (LJCDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, kappa); + + LJCDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCDMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljcd_gpu_clear() { + LJCDMF.clear(); +} + +int** ljcd_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljcd_gpu_bytes() { + return LJCDMF.host_memory_usage(); +} + + diff --git 
a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp new file mode 100644 index 0000000000..fe13808d28 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_dsf.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/dsf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_dsf_cl.h" +#elif defined(USE_CUDART) +const char *lj_dsf=0; +#else +#include "lj_dsf_cubin.h" +#endif + +#include "lal_lj_dsf.h" +#include +using namespace LAMMPS_AL; +#define LJDSFT LJDSF + +extern Device device; + +template +LJDSFT::LJDSF() : BaseCharge(), + _allocated(false) { +} + +template +LJDSFT::~LJDSF() { + clear(); +} + +template +int LJDSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double e_shift, const double f_shift, + const double alpha) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_dsf,"k_lj_dsf"); + if (success!=0) + return success; + + _cut_coulsq=host_cut_coulsq; + _e_shift=e_shift; + _f_shift=f_shift; + _alpha=alpha; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJDSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJDSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJDSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// 
--------------------------------------------------------------------------- +template +void LJDSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJDSF; diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu new file mode 100644 index 0000000000..9073d72884 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.cu @@ -0,0 +1,261 @@ +// ************************************************************************** +// lj_dsf.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the lj/cut/coul/dsf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : 7/12/2012 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + 
virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h new file mode 100644 index 0000000000..5badf543c4 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.h @@ -0,0 +1,85 @@ +/*************************************************************************** + lj_dsf.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/dsf pair style. 
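The damped-shifted-force (DSF) Coulomb terms evaluated by the kernels above, written with std::erfc for clarity (the device code uses a polynomial erfc fit). Here e_shift and f_shift are the usual DSF constants fixed at the cutoff Rc, f_shift = -(erfc(a*Rc)/Rc^2 + 2a/sqrt(pi)*exp(-a^2*Rc^2)/Rc) and e_shift = erfc(a*Rc)/Rc - f_shift*Rc; dsf_coul is an illustrative name:

#include <cmath>

// DSF Coulomb at distance r (r < cutoff); prefactor = qqrd2e*qi*qj/r.
inline void dsf_coul(const double prefactor, const double r,
                     const double alpha, const double e_shift,
                     const double f_shift, double &e_coul, double &fpair) {
  const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernel
  const double erfcc = std::erfc(alpha*r);
  const double erfcd = std::exp(-alpha*alpha*r*r);
  // energy, matching e = prefactor*(erfcc - r*e_shift - rsq*f_shift) above
  e_coul = prefactor*(erfcc - r*e_shift - r*r*f_shift);
  // force factor f/r
  fpair = prefactor*(erfcc/r + 2.0*alpha/MY_PIS*erfcd + r*f_shift)/r;
}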
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_DSF_H +#define LAL_LJ_DSF_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJDSF : public BaseCharge<numtyp, acctyp> { + public: + LJDSF(); + ~LJDSF(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp new file mode 100644 index 0000000000..52b56f38c2 --- /dev/null +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -0,0 +1,132 @@ +/*************************************************************************** + lj_dsf_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines.
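Both charge styles above select between the k_*_fast and generic kernels the same way; the decision in init() reduces to the check sketched below (the MAX_SHARED_TYPES-sized __local tables must fit, and the thread block must be large enough to stage them cooperatively). choose_shared_types is an illustrative name:

// Decide whether per-type coefficient tables can live in shared/local memory.
// If so, the type dimension is padded to max_shared_types and the *_fast
// kernels cooperatively load lj1/lj3/cutsq into __local arrays.
bool choose_shared_types(const int ntypes, const int max_shared_types,
                         const int block_size, int &lj_types) {
  lj_types = ntypes;
  if (ntypes <= max_shared_types && block_size >= max_shared_types) {
    lj_types = max_shared_types;  // pad table to the shared-memory tile size
    return true;                  // dispatch k_*_fast
  }
  return false;                   // generic kernel reads types from global memory
}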
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_dsf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJDSF LJDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double e_shift, const double f_shift, + const double alpha) { + LJDMF.clear(); + gpu_mode=LJDMF.device->gpu_mode(); + double gpu_split=LJDMF.device->particle_split(); + int first_gpu=LJDMF.device->first_device(); + int last_gpu=LJDMF.device->last_device(); + int world_me=LJDMF.device->world_me(); + int gpu_rank=LJDMF.device->gpu_rank(); + int procs_per_gpu=LJDMF.device->procs_per_gpu(); + + LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu); + + bool message=false; + if (LJDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, e_shift, + f_shift, alpha); + + LJDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJDMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljd_gpu_clear() { + LJDMF.clear(); +} + +int** ljd_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljd_gpu_bytes() { + return 
LJDMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp new file mode 100644 index 0000000000..ee0571fd3e --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -0,0 +1,315 @@ +/*************************************************************************** + yukawa_colloid.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the yukawa/colloid pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "yukawa_colloid_cl.h" +#elif defined(USE_CUDART) +const char *yukawa_colloid=0; +#else +#include "yukawa_colloid_cubin.h" +#endif + +#include "lal_yukawa_colloid.h" +#include +using namespace LAMMPS_AL; +#define YukawaColloidT YukawaColloid + +extern Device device; + +template +YukawaColloidT::YukawaColloid() : BaseAtomic(), +_allocated(false), _max_rad_size(0) { +} + +template +YukawaColloidT::~YukawaColloid() { + clear(); +} + +template +int YukawaColloidT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int YukawaColloidT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_offset, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, const double kappa) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,yukawa_colloid,"k_yukawa_colloid"); + if (success!=0) + return success; + + // allocate rad + + bool cpuview=false; + if (this->ucl_device->device_type()==UCL_CPU) + cpuview=true; + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_rad_size=static_cast(static_cast(ef_nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (cpuview) + dev_rad.view(host_rad); + else + dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY); + + rad_tex.get_texture(*(this->pair_program),"rad_tex"); + rad_tex.bind_float(dev_rad,1); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + _kappa = kappa; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a, + host_offset,host_cutsq); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void YukawaColloidT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + sp_lj.clear(); + + host_rad.clear(); + dev_rad.clear(); + + this->clear_atomic(); +} + +template +double YukawaColloidT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(YukawaColloid); +} + 
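The two compute() overloads below regrow the per-atom radius buffer with roughly 10% headroom (the 1.10 factor), so a slowly growing atom count does not force a reallocation and texture rebind on every step. The sizing rule in isolation (rad_capacity is an illustrative name):

// Grow-once-with-slack sizing used for _max_rad_size; returns the new
// capacity, unchanged if the current allocation already fits.
static int rad_capacity(const int nall, const int current_max) {
  if (nall <= current_max) return current_max;               // no reallocation
  return static_cast<int>(static_cast<double>(nall)*1.10);   // 10% headroom
}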
+// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then compute atom energies/forces +// --------------------------------------------------------------------------- +template +void YukawaColloidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *rad) { + this->acc_timers(); + + // ------------------- Resize rad array -------------------------- + + if (nall>_max_rad_size) { + dev_rad.clear(); + host_rad.clear(); + + _max_rad_size=static_cast(static_cast(nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } + } else { + dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + } + + rad_tex.bind_float(dev_rad,1); + } + + // ---------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return; + } + + int ago=this->hd_balancer.ago_first(f_ago); + int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); + this->ans->inum(inum); + host_start=inum; + + // ----------------------------------------------------------------- + + if (ago==0) { + this->reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + this->atom->cast_x_data(host_x,host_type); + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + this->add_rad_data(); + + this->loop(eflag,vflag); + this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU and then compute per-atom densities +// --------------------------------------------------------------------------- +template +int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double *rad) { + this->acc_timers(); + + // ------------------- Resize rad array ---------------------------- + + if (nall>_max_rad_size) { + dev_rad.clear(); + host_rad.clear(); + + _max_rad_size=static_cast(static_cast(nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } + } else { + dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + } + + rad_tex.bind_float(dev_rad,1); + } + + // ----------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return NULL; + } + + // load balance, returning the atom count on the device (inum) + 
this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->add_rad_data(); + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + this->loop(eflag,vflag); + this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); + + return this->nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate per-atom energies and forces +// --------------------------------------------------------------------------- +template +void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); + } + this->time_pair.stop(); +} + +template class YukawaColloid; diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu new file mode 100644 index 0000000000..4eb9bc87b2 --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -0,0 +1,202 @@ +// ************************************************************************** +// yukawa_colloid.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the yukawa/colloid pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture rad_tex; +#else +texture pos_tex; +texture rad_tex; +#endif + +#else +#define pos_tex x_ +#define rad_tex rad_ +#endif + +__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_, + __global numtyp4 *coeff, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, 
const int t_per_atom, + const numtyp kappa) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=coeff[mtype].x/kappa * screening; + energy+=factor_lj*(e-coeff[mtype].y); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_, + __global numtyp4 *coeff_in, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, + const numtyp kappa) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) { + numtyp e=coeff[mtype].x/kappa * screening; + energy+=factor_lj*(e-coeff[mtype].y); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h new file mode 100644 index 0000000000..cd3cbb5cab --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.h @@ -0,0 +1,127 @@ +/*************************************************************************** + yukawa_colloid.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the yukawa/colloid pair style. 
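The per-pair math the two kernels above implement, with the colloid variant measuring the screening from the particle surfaces (r shifted by radi+radj): E(r) = (a/kappa)*exp(-kappa*(r-(radi+radj))) - offset. A sketch; yukawa_colloid_eval is an illustrative name:

#include <cmath>

// Returns the force factor f/r; accumulates the offset-shifted pair energy.
inline double yukawa_colloid_eval(const double a, const double offset,
                                  const double kappa, const double r,
                                  const double radi, const double radj,
                                  const double factor_lj, double &energy) {
  const double screening = std::exp(-kappa*(r - (radi + radj)));
  energy += factor_lj*(a/kappa*screening - offset);  // matches the kernel line
  return factor_lj*a*screening/r;                    // -(dE/dr)/r
}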
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_YUKAWA_COLLOID_H +#define LAL_YUKAWA_COLLOID_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class YukawaColloid : public BaseAtomic<numtyp, acctyp> { + public: + YukawaColloid(); + ~YukawaColloid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_a, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const double kappa); + + inline void cast_rad_data(double* rad) { + int nall = this->atom->nall(); + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } else { + for (int i=0; i<nall; i++) + host_rad[i]=rad[i]; + } + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_rad.begin(),rad,nall*sizeof(numtyp)); + else + for (int i=0; i<nall; i++) + host_rad[i]=rad[i]; + } + } + + /// Copy rad data to device asynchronously + inline void add_rad_data() { + ucl_copy(dev_rad,host_rad,this->atom->nall(),true); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *rad); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *rad); + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture rad_tex; + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = a, coeff.y = offset, coeff.z = cutsq + UCL_D_Vec<numtyp4> coeff; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + int _max_rad_size; + + numtyp _kappa; + + /// Per-atom arrays + UCL_H_Vec<numtyp> host_rad; + UCL_D_Vec<numtyp> dev_rad; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp new file mode 100644 index 0000000000..c43166c33b --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -0,0 +1,123 @@
+/*************************************************************************** + yukawa_colloid_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to colloid acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_yukawa_colloid.h" + +using namespace std; +using namespace LAMMPS_AL; + +static YukawaColloid YKCOLLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + const double kappa) { + YKCOLLMF.clear(); + gpu_mode=YKCOLLMF.device->gpu_mode(); + double gpu_split=YKCOLLMF.device->particle_split(); + int first_gpu=YKCOLLMF.device->first_device(); + int last_gpu=YKCOLLMF.device->last_device(); + int world_me=YKCOLLMF.device->world_me(); + int gpu_rank=YKCOLLMF.device->gpu_rank(); + int procs_per_gpu=YKCOLLMF.device->procs_per_gpu(); + + YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu); + + bool message=false; + if (YKCOLLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, kappa); + + YKCOLLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + YKCOLLMF.estimate_gpu_overhead(); + return init_ok; +} + +void ykcolloid_gpu_clear() { + YKCOLLMF.clear(); +} + +int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_rad) { + return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_rad); +} + +void ykcolloid_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_rad) { + YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time, + success,host_rad); +} + +double ykcolloid_gpu_bytes() { + return YKCOLLMF.host_memory_usage(); +} + +
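A note on the int** returned by the *_gpu_compute_n wrappers: the underlying compute() returns host_jlist.begin()-host_start, an offset base pointer, so the caller can index it directly with the local atom ids i >= host_start that remained on the CPU. The idiom in isolation, with illustrative values (pointer arithmetic before an array's start is technically undefined in C++, but this is the convention the library relies on):

#include <cstdio>

int main() {
  int rows[4] = {100, 101, 102, 103};  // neighbor rows stored for atoms 2..5
  const int host_start = 2;            // atoms 0..1 handled on the device
  int *jlist = rows - host_start;      // offset base, as returned above
  std::printf("%d\n", jlist[2]);       // 100: row for atom id 2
  std::printf("%d\n", jlist[5]);       // 103: row for atom id 5
  return 0;
}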