git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8694 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-08-21 13:59:15 +00:00
parent 31551d81fd
commit 647ea4c29f
49 changed files with 8851 additions and 0 deletions

224
lib/gpu/geryon/ucl_matrix.h Normal file

@@ -0,0 +1,224 @@
/***************************************************************************
ucl_matrix.h
-------------------
W. Michael Brown
Matrix Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu May 10 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Matrix S-Object
template <class hosttype, class devtype>
class UCL_Matrix {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Mat<hosttype> host;
/// Device Allocation
UCL_D_Mat<devtype> device;
UCL_Matrix() { }
~UCL_Matrix() { }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \note When passing a command queue instead of a device, the device
* allocation is always performed, even if the device shares memory
* with the host.
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param acc Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain rows x cols elements
inline int resize(const int rows, const int cols) {
assert(host.kind()!=UCL_VIEW);
int err=host.resize(rows,cols);
if (err!=UCL_SUCCESS)
return err;
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,rows,cols);
}
/// Resize (only if bigger) the allocation to contain new_rows x new_cols elements
inline int resize_ib(const int new_rows, const int new_cols)
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
/// Get the number of rows
inline size_t rows() const { return host.rows(); }
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes()*device.rows(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access
inline hosttype & operator()(const int row, const int col)
{ return host(row,col); }
/// 2D access
inline const hosttype & operator()(const int row, const int col) const
{ return host(row,col); }
/// Return a pointer to the host memory pointer for the allocation
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Block until command_queue associated with matrix is complete
inline void sync() { host.sync(); }
/// Get the size of a row on the host (including any padding) in elements
inline size_t row_size() const { return host.row_size(); }
/// Get the size of a row on the host (including any padding) in bytes
inline size_t row_bytes() const { return host.row_bytes(); }
/// Get the size on the host in bytes of 1 element
inline int element_size() const { return sizeof(hosttype); }
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
inline void update_host(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,async); }
/// Update the allocation on the host (using command queue)
inline void update_host(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,cq); }
/// Update the first n elements on the host (true for asynchronous copy)
inline void update_host(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,async); }
/// Update the first n elements on the host (using command queue)
inline void update_host(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,cq); }
/// Update slice on the host (true for asynchronous copy)
inline void update_host(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,async); }
/// Update slice on the host (using command queue)
inline void update_host(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,cq); }
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
inline void update_device(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,async); }
/// Update the allocation on the device (using command queue)
inline void update_device(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,cq); }
/// Update the first n elements on the device (true for asynchronous copy)
inline void update_device(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,async); }
/// Update the first n elements on the device (using command queue)
inline void update_device(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,cq); }
/// Update slice on the device (true for asynchronous copy)
inline void update_device(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,async); }
/// Update slice on the device (using command queue)
inline void update_device(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,cq); }
private:
UCL_H_Mat<devtype> _buffer;
};
#endif
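For orientation, a minimal host-side usage sketch of the container above (hypothetical driver code: the header names and the default-constructed UCL_Device are assumptions about the surrounding Geryon setup, not part of this file):

#include "geryon/nvd_device.h"   // assumed: any backend header that defines
#include "geryon/nvd_mat.h"      // UCL_Device and enables _UCL_MAT_ALLOW

void example() {
  UCL_Device dev;                        // assumed default-constructible
  UCL_Matrix<double,float> m(4,8,dev);   // pinned host copy + device copy
  for (int i=0; i<(int)m.numel(); i++)
    m[i]=1.0;                            // stage data through the host view
  m.update_device();                     // async host->device (casts to float)
  // ... run kernels against m.device ...
  m.update_host();                       // async device->host
  m.sync();                              // block on the default command queue
}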

272
lib/gpu/geryon/ucl_s_obj_help.h Normal file

@@ -0,0 +1,272 @@
/***************************************************************************
ucl_s_obj_help.h
-------------------
W. Michael Brown
Helper routines for allocating memory for s-objects and performing
host/device updates. (Different routines depending on whether the
same type is used on the host and device).
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon May 14 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
template <int st> struct _ucl_s_obj_help;
// Host and device containers are same type
// -- Don't need casting buffers
// -- Can potentially use same memory if shared by accelerator
template <> struct _ucl_s_obj_help<1> {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(host);
return UCL_SUCCESS;
} else
return device.alloc(cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(host);
return UCL_SUCCESS;
} else
return device.alloc(rows,cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
ucl_copy(dst,src,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
return device.resize(cols);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
} else
return device.resize(rows,cols);
}
};
// Host and device containers are different types
template <int st> struct _ucl_s_obj_help {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(_buffer);
return UCL_SUCCESS;
} else
return device.alloc(cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,acc,kind1);
if (e1!=UCL_SUCCESS)
return e1;
if (acc.shared_memory()) {
device.view(_buffer);
return UCL_SUCCESS;
} else
return device.alloc(rows,cols,acc,kind2);
}
template <class t1, class t2, class t3, class mat_type>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
const int rows, const int cols, mat_type &cq,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
int e1;
e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
if (e1!=UCL_SUCCESS)
return e1;
e1=_buffer.alloc(rows,cols,cq,kind1);
if (e1!=UCL_SUCCESS)
return e1;
return device.alloc(rows,cols,cq,kind2);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,buffer,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
ucl_cast_copy(dst,src,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
ucl_cast_copy(dst,src,cols,buffer,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
ucl_cast_copy(dst,src,rows,cols,buffer,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
ucl_cast_copy(dst,src,rows,cols,buffer,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
int err=buff.resize(cols);
if (err!=UCL_SUCCESS)
return err;
if (device.kind()==UCL_VIEW) {
device.view(buff);
return UCL_SUCCESS;
} else
return device.resize(cols);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
const int cols) {
int err=buff.resize(rows,cols);
if (err!=UCL_SUCCESS)
return err;
if (device.kind()==UCL_VIEW) {
device.view(buff);
return UCL_SUCCESS;
} else
return device.resize(rows,cols);
}
};
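The dispatch above relies on ucl_same_type<hosttype,devtype>::ans being 1 exactly when the host and device element types match. A self-contained sketch of such a trait (an assumption about its definition elsewhere in Geryon, shown only to make the specialization choice concrete):

// Sketch of the assumed trait:
template <class t1, class t2> struct ucl_same_type { enum { ans=0 }; };
template <class t1> struct ucl_same_type<t1,t1> { enum { ans=1 }; };

// With this, UCL_Matrix<double,double> instantiates _ucl_s_obj_help<1>
// (no cast buffer; plain ucl_copy, and a zero-copy view when the
// accelerator shares host memory), while UCL_Matrix<double,float>
// instantiates the primary template, which routes every transfer through
// _buffer via ucl_cast_copy.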

223
lib/gpu/geryon/ucl_vector.h Normal file

@@ -0,0 +1,223 @@
/***************************************************************************
ucl_vector.h
-------------------
W. Michael Brown
Vector Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu May 10 2012
copyright : (C) 2012 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row Vector S-Object
template <class hosttype, class devtype>
class UCL_Vector {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef hosttype data_type;
/// Host Allocation
UCL_H_Vec<hosttype> host;
/// Device Allocation
UCL_D_Vec<devtype> device;
UCL_Vector() { }
~UCL_Vector() { }
/// Construct with n columns
/** \sa alloc() **/
UCL_Vector(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Set up the vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \note When passing a command queue instead of a device, the device
* allocation is always performed, even if the device shares memory
* with the host.
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,cq,kind1,kind2); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind1 parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* The kind2 parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param acc Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
/// Free memory and set size to 0
inline void clear()
{ host.clear(); device.clear(); }
/// Resize the allocation to contain cols elements
inline int resize(const int cols) {
assert(host.kind()!=UCL_VIEW);
int err=host.resize(cols);
if (err!=UCL_SUCCESS)
return err;
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
dev_resize(device,host,_buffer,cols);
}
/// Resize (only if bigger) the allocation to contain cols elements
inline int resize_ib(const int new_cols)
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
/// Set each element to zero
inline void zero() { host.zero(); device.zero(); }
/// Set first n elements to zero
inline void zero(const int n) { host.zero(n); device.zero(n); }
/// Get the number of elements
inline size_t numel() const { return host.numel(); }
/// Get the number of rows
inline size_t rows() const { return host.rows(); }
/// Get the number of columns
inline size_t cols() const { return host.cols(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t host_mem_usage()
{ return host.row_bytes()+_buffer.row_bytes(); }
/// Get the memory usage (bytes) of the s-object (including any buffers)
inline size_t device_mem_usage()
{ return device.row_bytes(); }
/// Get element at index i
inline hosttype & operator[](const int i) { return host[i]; }
/// Get element at index i
inline const hosttype & operator[](const int i) const { return host[i]; }
/// 2D access (row should always be 0)
inline hosttype & operator()(const int row, const int col)
{ return host[col]; }
/// 2D access (row should always be 0)
inline const hosttype & operator()(const int row, const int col) const
{ return host[col]; }
/// Return a pointer to the host memory pointer for the allocation
inline hosttype ** host_ptr() { return host.host_ptr(); }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return host.cq(); }
/// Block until command_queue associated with vector is complete
inline void sync() { host.sync(); }
/// Get the size of a row on the host (including any padding) in elements
inline size_t row_size() const { return host.row_size(); }
/// Get the size of a row on the host (including any padding) in bytes
inline size_t row_bytes() const { return host.row_bytes(); }
/// Get the size on the host in bytes of 1 element
inline int element_size() const { return sizeof(hosttype); }
/// Update the allocation on the host asynchronously
inline void update_host()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,true); }
/// Update the allocation on the host (true for asynchronous copy)
inline void update_host(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,async); }
/// Update the allocation on the host (using command queue)
inline void update_host(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,_buffer,cq); }
/// Update the first n elements on the host (true for asynchronous copy)
inline void update_host(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,async); }
/// Update the first n elements on the host (using command queue)
inline void update_host(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,n,_buffer,cq); }
/// Update slice on the host (true for asynchronous copy)
inline void update_host(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,async); }
/// Update slice on the host (using command queue)
inline void update_host(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(host,device,rows,cols,_buffer,cq); }
/// Update the allocation on the device asynchronously
inline void update_device()
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,true); }
/// Update the allocation on the device (true for asynchronous copy)
inline void update_device(const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,async); }
/// Update the allocation on the device (using command queue)
inline void update_device(command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,_buffer,cq); }
/// Update the first n elements on the device (true for asynchronous copy)
inline void update_device(const int n, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,async); }
/// Update the first n elements on the device (using command queue)
inline void update_device(const int n, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,n,_buffer,cq); }
/// Update slice on the device (true for asynchronous copy)
inline void update_device(const int rows, const int cols, const bool async)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,async); }
/// Update slice on the device (using command queue)
inline void update_device(const int rows, const int cols, command_queue &cq)
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
copy(device,host,rows,cols,_buffer,cq); }
private:
UCL_H_Vec<devtype> _buffer;
};
#endif
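As with the matrix container, the count and slice overloads let callers move only what changed. A small sketch (hypothetical helper, same assumptions as the matrix example above):

void scatter_first_n(UCL_Vector<double,float> &v, const int n) {
  for (int i=0; i<n; i++)
    v[i]=0.5*i;            // stage only the first n elements on the host
  v.update_device(n,true); // copy just those n elements, asynchronously
  v.sync();                // block until the transfer completes
}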

315
lib/gpu/lal_base_dipole.cpp Normal file

@@ -0,0 +1,315 @@
/***************************************************************************
base_dipole.cpp
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
dipole, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include "lal_base_dipole.h"
using namespace LAMMPS_AL;
#define BaseDipoleT BaseDipole<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
}
template <class numtyp, class acctyp>
BaseDipoleT::~BaseDipole() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseDipoleT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const void *pair_program,
const char *k_name) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
mu_tex.bind_float(atom->quat,4);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return success;
}
template <class numtyp, class acctyp>
void BaseDipoleT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseDipoleT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
nbor->clear();
ans->clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseDipoleT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q, double **host_mu,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_q_data();
atom->add_quat_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double **host_mu,
double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
atom->cast_quat_data(host_mu[0]);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
atom->add_quat_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double BaseDipoleT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
return;
std::string s_fast=std::string(kname)+"_fast";
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
mu_tex.get_texture(*pair_program,"mu_tex");
_compiled=true;
}
template class BaseDipole<PRECISION,ACC_PRECISION>;
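For context, the intended call order from a host pair style is init_atomic() once at setup, one compute() per timestep, and clear_atomic() at teardown; schematically (hypothetical caller, argument names are placeholders):

// base.init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
//                  screen,pair_program,"k_mystyle");        // once, at setup
// base.compute(ago,inum,nall,x,type,ilist,numj,firstneigh,  // each timestep,
//              eflag,vflag,eatom,vatom,host_start,          // host neighboring
//              cpu_time,success,q,mu,nlocal,boxlo,prd);
// (use the int** overload instead when neighboring on the device)
// base.clear_atomic();                                      // at teardown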

200
lib/gpu/lal_base_dipole.h Normal file

@@ -0,0 +1,200 @@
/***************************************************************************
base_dipole.h
-------------------
Trung Dac Nguyen (ORNL)
Base class for pair styles needing per-particle data for position,
dipole, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BASE_DIPOLE_H
#define LAL_BASE_DIPOLE_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseDipole {
public:
BaseDipole();
virtual ~BaseDipole();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
mu_tex.bind_float(atom->quat,4);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
double **mu, const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double **mu, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
UCL_Texture mu_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
}
#endif

162
lib/gpu/lal_born.cpp Normal file

@@ -0,0 +1,162 @@
/***************************************************************************
born.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_cl.h"
#elif defined(USE_CUDART)
const char *born=0;
#else
#include "born_cubin.h"
#endif
#include "lal_born.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornT Born<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
BornT::~Born() {
clear();
}
template <class numtyp, class acctyp>
int BornT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornT::init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born,"k_born");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_sigma);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Born<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2,
&cutsq_sigma, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
&cutsq_sigma, &_lj_types, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Born<PRECISION,ACC_PRECISION>;
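A worked example of the grid-size arithmetic in loop(), with illustrative numbers only: for BX=128 threads per block, _threads_per_atom=4, and inum=10000 atoms, each block covers BX/t_per_atom = 32 atoms, so GX = ceil(10000/32) = 313 blocks.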

201
lib/gpu/lal_born.cu Normal file

@@ -0,0 +1,201 @@
// **************************************************************************
// born.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
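// For reference, the energy evaluated below is the Born-Mayer-Huggins form
//   E(r) = A*exp((sigma-r)/rho) - C/r^6 + D/r^8   for r < r_c,
// with coeff1 = {1/rho, born1, born2, born3} and coeff2 = {a, c, d, offset}
// as packed by the host class. Assuming the usual pair_born precomputation
// (born1 = A/rho, born2 = 6*C, born3 = 8*D), the scalar computed into
// 'force' is (-dE/dr)/r, so multiplying by delx/dely/delz yields the
// Cartesian force components.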
__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2,
__global numtyp2 *cutsq_sigma,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp2 *cutsq_sigma,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq_sigma[mtype].x) {
numtyp r=ucl_sqrt(r2inv);
numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
r2inv=ucl_recip(r2inv);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*(coeff1[mtype].y*r*rexp
- coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

84
lib/gpu/lal_born.h Normal file

@@ -0,0 +1,84 @@
/***************************************************************************
born.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_H
#define LAL_BORN_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Born : public BaseAtomic<numtyp, acctyp> {
public:
Born();
~Born();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq,
double **host_rhoinv, double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **host_sigma,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma
UCL_D_Vec<numtyp2> cutsq_sigma;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

175
lib/gpu/lal_born_coul_long.cpp Normal file

@@ -0,0 +1,175 @@
/***************************************************************************
born_coul_long.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_coul_long_cl.h"
#elif defined(USE_CUDART)
const char *born_coul_long=0;
#else
#include "born_coul_long_cubin.h"
#endif
#include "lal_born_coul_long.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornCoulLongT BornCoulLong<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
BornCoulLongT::~BornCoulLongT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born_coul_long,"k_born_long");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornCoulLongT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornCoulLongT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(BornCoulLong<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force,
&this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class BornCoulLong<PRECISION,ACC_PRECISION>;
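
A note on the layout these init() routines share: per-type-pair constants are packed into flattened lj_types x lj_types vec4 tables, so the kernels index a pair (itype,jtype) as mtype = itype*lj_types + jtype (or itype*MAX_SHARED_TYPES + jtype on the fast path). A minimal host-side sketch of that packing, with a hypothetical stand-in for the UCL containers and 0-based type indices for simplicity:

#include <vector>

// Sketch only: mirrors the flattened indexing behind type_pack4 above;
// quad and pack4_sketch are illustrative stand-ins, not library code.
struct quad { double x, y, z, w; };

std::vector<quad> pack4_sketch(int ntypes, int lj_types,
                               double **a, double **b, double **c, double **d) {
  std::vector<quad> out(lj_types * lj_types, {0.0, 0.0, 0.0, 0.0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      quad &p = out[i * lj_types + j];   // the same mtype the kernels compute
      p.x = a[i][j]; p.y = b[i][j]; p.z = c[i][j]; p.w = d[i][j];
    }
  return out;
}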

@ -0,0 +1,262 @@
// **************************************************************************
// born_coul_long.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born/coul/long pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else forcecoul = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) {
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
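
Both kernels above approximate the real-space complementary error function with the classic Abramowitz & Stegun 7.1.26 rational polynomial; EWALD_P and A1..A5 are defined elsewhere in the library (the values below are the standard A&S constants, assumed here rather than copied from this diff), and EWALD_F is 2/sqrt(pi) for the derivative term. A standalone check against std::erfc:

#include <cmath>
#include <cstdio>

// Standard A&S 7.1.26 constants; assumed to match the library's definitions.
const double EWALD_P = 0.3275911;
const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
             A4 = -1.453152027, A5 =  1.061405429;

double erfc_approx(double x) {
  double t = 1.0 / (1.0 + EWALD_P * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}

int main() {
  for (double x = 0.1; x < 3.0; x += 0.5)   // absolute error stays below ~1.5e-7
    std::printf("x=%.1f  approx=%.8f  exact=%.8f\n",
                x, erfc_approx(x), std::erfc(x));
  return 0;
}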

@ -0,0 +1,88 @@
/***************************************************************************
born_coul_long.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/long pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_COUL_LONG_H
#define LAL_BORN_COUL_LONG_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BornCoulLong : public BaseCharge<numtyp, acctyp> {
public:
BornCoulLong();
~BornCoulLong();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
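
Given the packing documented above (coeff1 = rhoinv, born1, born2, born3; coeff2 = a, c, d, offset; cutsq_sigma = cutsq, cut_ljsq, sigma), a host-side reference of the Born terms the kernels evaluate can be sketched as follows; illustrative only, not library code:

#include <cmath>

struct Coeff1 { double rhoinv, born1, born2, born3; };  // coeff1.x .. coeff1.w
struct Coeff2 { double a, c, d, offset; };              // coeff2.x .. coeff2.w

// Pair energy: a*exp((sigma-r)*rhoinv) - c/r^6 + d/r^8 - offset
double born_energy(double rsq, double sigma, const Coeff1 &c1, const Coeff2 &c2) {
  double r = std::sqrt(rsq);
  double r2inv = 1.0 / rsq, r6inv = r2inv * r2inv * r2inv;
  double rexp = std::exp((sigma - r) * c1.rhoinv);
  return c2.a * rexp - c2.c * r6inv + c2.d * r2inv * r6inv - c2.offset;
}

// Returns r*(-dU/dr); the kernels multiply this by r2inv and then by
// delx/dely/delz to get the force components.
double born_force_r(double rsq, double sigma, const Coeff1 &c1) {
  double r = std::sqrt(rsq);
  double r2inv = 1.0 / rsq, r6inv = r2inv * r2inv * r2inv;
  double rexp = std::exp((sigma - r) * c1.rhoinv);
  return c1.born1 * r * rexp - c1.born2 * r6inv + c1.born3 * r2inv * r6inv;
}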

@ -0,0 +1,132 @@
/***************************************************************************
born_coul_long_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born/coul/long acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born_coul_long.h"
using namespace std;
using namespace LAMMPS_AL;
static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
BORNCLMF.clear();
gpu_mode=BORNCLMF.device->gpu_mode();
double gpu_split=BORNCLMF.device->particle_split();
int first_gpu=BORNCLMF.device->first_device();
int last_gpu=BORNCLMF.device->last_device();
int world_me=BORNCLMF.device->world_me();
int gpu_rank=BORNCLMF.device->gpu_rank();
int procs_per_gpu=BORNCLMF.device->procs_per_gpu();
BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu);
bool message=false;
if (BORNCLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
BORNCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNCLMF.estimate_gpu_overhead();
return init_ok;
}
void borncl_gpu_clear() {
BORNCLMF.clear();
}
int** borncl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double borncl_gpu_bytes() {
return BORNCLMF.host_memory_usage();
}
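
All of the _ext wrappers stage initialization the same way: world rank 0 initializes (and compiles the kernels) first, everyone synchronizes, and then the ranks sharing each GPU take turns. A schematic of that control flow, with hypothetical function-pointer stand-ins for the library calls:

// Control-flow sketch of the *_gpu_init staging; init_fn, world_barrier,
// and gpu_barrier are hypothetical stand-ins for the calls above.
int staged_init(int world_me, int gpu_rank, int procs_per_gpu,
                int (*init_fn)(), void (*world_barrier)(),
                void (*gpu_barrier)()) {
  int init_ok = 0;
  if (world_me == 0)            // rank 0 compiles/initializes first
    init_ok = init_fn();
  world_barrier();              // everyone waits for the compile
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)   // stagger ranks sharing a GPU
      init_ok = init_fn();
    gpu_barrier();
  }
  return init_ok;               // 0 on success, negative error code otherwise
}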

@ -0,0 +1,176 @@
/***************************************************************************
born_coul_wolf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/wolf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "born_coul_wolf_cl.h"
#elif defined(USE_CUDART)
const char *born_coul_wolf=0;
#else
#include "born_coul_wolf_cubin.h"
#endif
#include "lal_born_coul_wolf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define BornCoulWolfT BornCoulWolf<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
BornCoulWolfT::~BornCoulWolfT() {
clear();
}
template <class numtyp, class acctyp>
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double alf, const double e_shift, const double f_shift) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,born_coul_wolf,"k_born_wolf");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
host_cut_ljsq,host_sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_alf=alf;
_e_shift=e_shift;
_f_shift=f_shift;
_allocated=true;
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void BornCoulWolfT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff1.clear();
coeff2.clear();
cutsq_sigma.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double BornCoulWolfT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(BornCoulWolf<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
&_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&cutsq_sigma, &_cut_coulsq,
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class BornCoulWolf<PRECISION,ACC_PRECISION>;

@ -0,0 +1,282 @@
// **************************************************************************
// born_coul_wolf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the born/coul/wolf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
__global numtyp4* coeff2, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp r2inv = ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
__global numtyp4* coeff2_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *cutsq_sigma,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp alf, const numtyp e_shift,
const numtyp f_shift, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
coeff2[tid]=coeff2_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
if (eflag>0) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
}
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp v_sh = (numtyp)0.0;
numtyp rexp = (numtyp)0.0;
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
r6inv = r2inv*r2inv*r2inv;
forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r=ucl_rsqrt(r2inv);
numtyp arij = alf * r;
numtyp expm2 = ucl_exp(-arij*arij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp/r;
v_sh = (_erfc - e_shift*r)*prefactor;
numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
forcecoul = prefactor * (dvdrr*rsq-factor_coul);
} else forcecoul = (numtyp)0.0;
force = (forceborn + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq)
e_coul += prefactor*(v_sh-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
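
Unlike the Ewald kernels, both Wolf kernels add a per-atom self term before the neighbor loop, split across the t_per_atom threads that share atom i and accumulated as 2*e_self, presumably so it survives the halving later applied to energies tallied over full neighbor lists. A host-side sketch of the bare term:

// Sketch of the per-atom Wolf self energy added above; q is the atom's
// charge, alpha the damping parameter (alf), qqrd2e the conversion constant.
double wolf_self_energy(double q, double alpha, double e_shift, double qqrd2e) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi), as defined above
  return -(0.5 * e_shift + alpha / MY_PIS) * q * q * qqrd2e;
}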

@ -0,0 +1,89 @@
/***************************************************************************
born_coul_wolf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the born/coul/wolf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_BORN_COUL_WOLF_H
#define LAL_BORN_COUL_WOLF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
public:
BornCoulWolf();
~BornCoulWolf();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **host_sigma, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double alf, const double e_shift,
const double f_shift);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
/// coeff1.w = born3
UCL_D_Vec<numtyp4> coeff1;
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
UCL_D_Vec<numtyp4> coeff2;
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
/// cutsq_sigma.z = sigma
UCL_D_Vec<numtyp4> cutsq_sigma;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,134 @@
/***************************************************************************
born_coul_wolf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born/coul/wolf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born_coul_wolf.h"
using namespace std;
using namespace LAMMPS_AL;
static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2, double **host_born3,
double **host_a, double **host_c, double **host_d,
double **sigma, double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double alf, const double e_shift, const double f_shift) {
BORNCWMF.clear();
gpu_mode=BORNCWMF.device->gpu_mode();
double gpu_split=BORNCWMF.device->particle_split();
int first_gpu=BORNCWMF.device->first_device();
int last_gpu=BORNCWMF.device->last_device();
int world_me=BORNCWMF.device->world_me();
int gpu_rank=BORNCWMF.device->gpu_rank();
int procs_per_gpu=BORNCWMF.device->procs_per_gpu();
BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu);
bool message=false;
if (BORNCWMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
BORNCWMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNCWMF.estimate_gpu_overhead();
return init_ok;
}
void borncw_gpu_clear() {
BORNCWMF.clear();
}
int** borncw_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double borncw_gpu_bytes() {
return BORNCWMF.host_memory_usage();
}

lib/gpu/lal_born_ext.cpp Normal file
@ -0,0 +1,124 @@
/***************************************************************************
born_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to born acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_born.h"
using namespace std;
using namespace LAMMPS_AL;
static Born<PRECISION,ACC_PRECISION> BORNMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
double **host_born1, double **host_born2,
double **host_born3, double **host_a, double **host_c,
double **host_d, double **sigma,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
BORNMF.clear();
gpu_mode=BORNMF.device->gpu_mode();
double gpu_split=BORNMF.device->particle_split();
int first_gpu=BORNMF.device->first_device();
int last_gpu=BORNMF.device->last_device();
int world_me=BORNMF.device->world_me();
int gpu_rank=BORNMF.device->gpu_rank();
int procs_per_gpu=BORNMF.device->procs_per_gpu();
BORNMF.device->init_message(screen,"born",first_gpu,last_gpu);
bool message=false;
if (BORNMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
BORNMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
BORNMF.estimate_gpu_overhead();
return init_ok;
}
void born_gpu_clear() {
BORNMF.clear();
}
int ** born_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void born_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double born_gpu_bytes() {
return BORNMF.host_memory_usage();
}
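
Each _ext wrapper exposes two compute paths: *_gpu_compute_n for device-built neighbor lists (it hands ilist/jnum back so the host can handle atoms from host_start on) and *_gpu_compute for host-built ilist/numj/firstneigh. A hypothetical dispatch sketch; the mode query and its encoding are assumptions, not library API:

// Illustrative only: the name and gpu_mode encoding below are assumptions.
bool neighbor_lists_on_device(int gpu_mode) {
  return gpu_mode == 1;  // assumption: 1 = build neighbor lists on the GPU
}

void compute_step(int gpu_mode /* , per-step arguments elided */) {
  if (neighbor_lists_on_device(gpu_mode)) {
    // born_gpu_compute_n(...): device builds the lists and returns
    // ilist/jnum; atoms from host_start onward remain for the host.
  } else {
    // born_gpu_compute(...): host-built ilist/numj/firstneigh are reused.
  }
}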

lib/gpu/lal_colloid.cpp Normal file
@ -0,0 +1,181 @@
/***************************************************************************
colloid.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "colloid_cl.h"
#elif defined(USE_CUDART)
const char *colloid=0;
#else
#include "colloid_cubin.h"
#endif
#include "lal_colloid.h"
#include <cassert>
using namespace LAMMPS_AL;
#define ColloidT Colloid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
ColloidT::~Colloid() {
clear();
}
template <class numtyp, class acctyp>
int ColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int ColloidT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, double **host_a12,
double **host_a1, double **host_a2,
double **host_d1, double **host_d2,
double **host_sigma3, double **host_sigma6,
int **host_form, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,colloid,"k_colloid");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
colloid1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,colloid1,host_write,host_a12,host_a1,
host_a2);
colloid2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,colloid2,host_write,host_d1,host_d2,
host_sigma3,host_sigma6);
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<ntypes; i++)
for (int j=0; j<ntypes; j++) {
dview_form[i*lj_types+j]=host_form[i][j];
}
ucl_copy(form,dview_form,false);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()
+colloid1.row_bytes()+colloid2.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void ColloidT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
colloid1.clear();
colloid2.clear();
form.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double ColloidT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Colloid<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ColloidT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Colloid<PRECISION,ACC_PRECISION>;
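
The colloid kernels dispatch on a per-pair form code: 0 = SMALL_SMALL (plain 12-6 LJ), 1 = SMALL_LARGE (point vs. sphere), 2 = LARGE_LARGE (sphere vs. sphere Hamaker integration), matching the branch comments in the device code. A sketch of that dispatch with only the trivial branch spelled out:

#include <cmath>

enum Form { SMALL_SMALL = 0, SMALL_LARGE = 1, LARGE_LARGE = 2 };

// Sketch of the per-pair F/r factor the kernels multiply into
// delx/dely/delz; illustrative only, not library code.
double colloid_force_over_r(Form form, double rsq, double lj1x, double lj1y,
                            double factor_lj) {
  if (form == SMALL_SMALL) {
    double r2inv = 1.0 / rsq;
    double r6inv = r2inv * r2inv * r2inv;
    return factor_lj * r2inv * r6inv * (lj1x * r6inv - lj1y);
  }
  // SMALL_LARGE and LARGE_LARGE follow the Hamaker expressions in the
  // kernels above and are omitted here.
  return 0.0;
}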

lib/gpu/lal_colloid.cu Normal file
@ -0,0 +1,329 @@
// **************************************************************************
// colloid.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the colloid pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in,
__global numtyp4* colloid1,
__global numtyp4* colloid2,
__global int *form,
__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
K[4] = rsq*rsq;
K[3] = K[1] - K[2];
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
K[3] = K[1]+r;
K[4] = K[1]-r;
K[5] = K[2]+r;
K[6] = K[2]-r;
K[7] = ucl_recip(K[3]*K[4]);
K[8] = ucl_recip(K[5]*K[6]);
g[0] = ucl_powr(K[3],(numtyp)-7.0);
g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
g[2] = ucl_powr(K[5],(numtyp)-7.0);
g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
}
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
(numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_colloid_fast(__global numtyp4 *x_,
__global numtyp4 *lj1_in,
__global numtyp4 *lj3_in,
__global numtyp *sp_lj_in,
__global numtyp4 *colloid1_in,
__global numtyp4 *colloid2_in,
__global int *form_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
colloid1[tid]=colloid1_in[tid];
colloid2[tid]=colloid2_in[tid];
form[tid]=form_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].z) {
numtyp r,r2inv,r6inv;
numtyp c1,c2,fR,evdwl;
numtyp K[9],h[4],g[4];
numtyp force = (numtyp)0;
if (form[mtype]==0) { // SMALL_SMALL
r2inv=ucl_recip(rsq);
r6inv = r2inv*r2inv*r2inv;
force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
} else if (form[mtype]==1) { // SMALL_LARGE
c2 = colloid1[mtype].z;
K[1] = c2*c2;
K[2] = rsq;
K[0] = K[1] - rsq;
K[4] = rsq*rsq;
K[3] = K[1] - K[2];
K[3] *= K[3]*K[3];
K[6] = K[3]*K[3];
fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
force = (numtyp)4.0/(numtyp)15.0*fR *
((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
force*=factor_lj;
} else if (form[mtype]==2) { // LARGE_LARGE
r = ucl_sqrt(rsq);
c1 = colloid1[mtype].y;
c2 = colloid1[mtype].z;
K[0] = c1*c2;
K[1] = c1+c2;
K[2] = c1-c2;
K[3] = K[1]+r;
K[4] = K[1]-r;
K[5] = K[2]+r;
K[6] = K[2]-r;
K[7] = ucl_recip(K[3]*K[4]);
K[8] = ucl_recip(K[5]*K[6]);
g[0] = ucl_powr(K[3],(numtyp)-7.0);
g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
g[2] = ucl_powr(K[5],(numtyp)-7.0);
g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];
fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
force = factor_lj * (dUR+dUA)/r;
} else force = (numtyp)0.0;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
} else if (form[mtype]==1) {
e=(numtyp)2.0/(numtyp)9.0*fR *
((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
colloid2[mtype].w/K[6]);
} else if (form[mtype]==2) {
e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

lib/gpu/lal_colloid.h Normal file
@ -0,0 +1,89 @@
/***************************************************************************
colloid.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_COLLOID_H
#define LAL_COLLOID_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Colloid : public BaseAtomic<numtyp, acctyp> {
public:
Colloid();
~Colloid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
UCL_D_Vec<numtyp4> colloid1;
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
/// colloid2.w = sigma6
UCL_D_Vec<numtyp4> colloid2;
/// form
UCL_D_Vec<int> form;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
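The negative return codes documented in init() above are shared by the LAMMPS_AL pair styles added in this commit. A minimal host-side sketch of mapping them to messages; the helper and its wording are illustrative, not part of the library:
#include <cstdio>
// Hypothetical helper: translate a LAMMPS_AL init() code to text.
// Codes follow the doc comment in lal_colloid.h above.
const char *lal_init_error(int code) {
  switch (code) {
    case 0:  return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory error";
    case -4: return "GPU library was not compiled for this style";
    case -5: return "double precision is not supported on card";
    default: return "unknown error";
  }
}
int main() {
  int init_ok = -3;   // e.g. a value returned by Colloid::init()
  std::printf("init: %s\n", lal_init_error(init_ok));
  return 0;
}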

127
lib/gpu/lal_colloid_ext.cpp Normal file
View File

@ -0,0 +1,127 @@
/***************************************************************************
colloid_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to colloid acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_colloid.h"
using namespace std;
using namespace LAMMPS_AL;
static Colloid<PRECISION,ACC_PRECISION> COLLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj,
double **host_a12, double **host_a1, double **host_a2,
double **host_d1, double **host_d2, double **host_sigma3,
double **host_sigma6, int **host_form, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
COLLMF.clear();
gpu_mode=COLLMF.device->gpu_mode();
double gpu_split=COLLMF.device->particle_split();
int first_gpu=COLLMF.device->first_device();
int last_gpu=COLLMF.device->last_device();
int world_me=COLLMF.device->world_me();
int gpu_rank=COLLMF.device->gpu_rank();
int procs_per_gpu=COLLMF.device->procs_per_gpu();
COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu);
bool message=false;
if (COLLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, host_a12, host_a1,
host_a2, host_d1, host_d2, host_sigma3,
host_sigma6, host_form, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
COLLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, host_a12, host_a1, host_a2,
host_d1, host_d2, host_sigma3, host_sigma6, host_form,
inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
COLLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
COLLMF.estimate_gpu_overhead();
return init_ok;
}
void colloid_gpu_clear() {
COLLMF.clear();
}
int ** colloid_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double colloid_gpu_bytes() {
return COLLMF.host_memory_usage();
}
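Together, the entry points above span the lifetime of the accelerated style. A sketch of the call order a host pair style would follow, with argument lists elided for brevity; whether compute_n or compute is used depends on who builds the neighbor lists:
// Hypothetical driver, assuming only the entry points declared above.
// int err = colloid_gpu_init(...);      // once, before the run
// for each timestep:
//   colloid_gpu_compute_n(...);         //   device builds neighbor lists
//   ... or ...
//   colloid_gpu_compute(...);           //   host-built lists passed in
// double mb = colloid_gpu_bytes();      // optional memory report
// colloid_gpu_clear();                  // once, after the run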

153
lib/gpu/lal_coul_dsf.cpp Normal file
View File

@ -0,0 +1,153 @@
/***************************************************************************
coul_dsf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "coul_dsf_cl.h"
#elif defined(USE_CUDART)
const char *coul_dsf=0;
#else
#include "coul_dsf_cubin.h"
#endif
#include "lal_coul_dsf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define CoulDSFT CoulDSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
CoulDSFT::CoulDSF() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CoulDSFT::~CoulDSF() {
clear();
}
template <class numtyp, class acctyp>
int CoulDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,coul_dsf,"k_coul_dsf");
if (success!=0)
return success;
_cut_coulsq=host_cut_coulsq;
_e_shift=e_shift;
_f_shift=f_shift;
_alpha=alpha;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,4,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void CoulDSFT::clear() {
if (!_allocated)
return;
_allocated=false;
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CoulDSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CoulDSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class CoulDSF<PRECISION,ACC_PRECISION>;
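In loop() above, each block of BX threads covers BX/t_per_atom atoms, so the grid size GX is the atom count divided by that quotient, rounded up. A standalone check of the arithmetic with illustrative values:
#include <cmath>
#include <cstdio>
int main() {
  int inum = 10000;     // atoms assigned to this device (illustrative)
  int BX = 128;         // block size, as block_size() might return
  int t_per_atom = 4;   // threads cooperating on each atom
  // Same expression as in CoulDSFT::loop():
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / t_per_atom)));
  std::printf("GX = %d\n", GX);  // 10000/32 rounded up = 313
  return 0;
}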

214
lib/gpu/lal_coul_dsf.cu Normal file
View File

@ -0,0 +1,214 @@
// **************************************************************************
// coul_dsf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the coul/dsf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : 8/15/2012
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force;
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_coul, r, prefactor, erfcc;
factor_coul = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force;
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
force = forcecoul * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
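Both kernels above evaluate erfc(alpha*r) with the Abramowitz and Stegun 7.1.26 rational approximation instead of a library call. EWALD_P and A1-A5 are defined elsewhere in the library; the values below are the standard ones and are stated here as an assumption. A host-side check against std::erfc:
#include <cmath>
#include <cstdio>
// Assumed constants (Abramowitz & Stegun eq. 7.1.26):
const double EWALD_P = 0.3275911;
const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741;
const double A4 = -1.453152027, A5 =  1.061405429;
int main() {
  double alpha = 0.5, r = 2.0, rsq = r*r;
  // Same sequence as in k_coul_dsf:
  double erfcd = exp(-alpha*alpha*rsq);
  double t = 1.0/(1.0 + EWALD_P*alpha*r);
  double erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
  std::printf("approx %.8f  erfc %.8f\n", erfcc, erfc(alpha*r));
  return 0;
}
The approximation agrees with erfc to roughly 1e-7, ample for single-precision kernels.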

78
lib/gpu/lal_coul_dsf.h Normal file
View File

@ -0,0 +1,78 @@
/***************************************************************************
coul_dsf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_COUL_DSF_H
#define LAL_COUL_DSF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class CoulDSF : public BaseCharge<numtyp, acctyp> {
public:
CoulDSF();
~CoulDSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
- 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
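sp_lj above is indexed by the special-bond class packed into the top bits of each neighbor index; in the kernels, sbmask() extracts the class and NEIGHMASK strips it off. A host-side sketch of that decode, assuming the usual LAMMPS encoding of SBBITS = 30 and NEIGHMASK = 0x3FFFFFFF (both defined elsewhere in the library):
#include <cstdio>
const int SBBITS = 30;              // assumed, as in the LAMMPS neighbor code
const int NEIGHMASK = 0x3FFFFFFF;   // low 30 bits hold the neighbor index
int sbmask(int j) { return (j >> SBBITS) & 3; }
int main() {
  double sp_lj[4] = {1.0, 0.0, 0.0, 0.5};  // factor per special-bond class
  int j = (1 << SBBITS) | 42;              // neighbor 42, flagged 1-2 special
  std::printf("factor=%g index=%d\n", sp_lj[sbmask(j)], j & NEIGHMASK);
  return 0;   // prints factor=0 index=42
}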

125
lib/gpu/lal_coul_dsf_ext.cpp Normal file
View File

@ -0,0 +1,125 @@
/***************************************************************************
coul_dsf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to coul/dsf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 8/15/2012
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_coul_dsf.h"
using namespace std;
using namespace LAMMPS_AL;
static CoulDSF<PRECISION,ACC_PRECISION> CDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha) {
CDMF.clear();
gpu_mode=CDMF.device->gpu_mode();
double gpu_split=CDMF.device->particle_split();
int first_gpu=CDMF.device->first_device();
int last_gpu=CDMF.device->last_device();
int world_me=CDMF.device->world_me();
int gpu_rank=CDMF.device->gpu_rank();
int procs_per_gpu=CDMF.device->procs_per_gpu();
CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu);
bool message=false;
if (CDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha);
CDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_coulsq, host_special_coul,
qqrd2e, e_shift, f_shift, alpha);
CDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
CDMF.estimate_gpu_overhead();
return init_ok;
}
void cdsf_gpu_clear() {
CDMF.clear();
}
int** cdsf_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void cdsf_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double cdsf_gpu_bytes() {
return CDMF.host_memory_usage();
}

170
lib/gpu/lal_dipole_lj.cpp Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
dipole_lj.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "dipole_lj_cl.h"
#elif defined(USE_CUDART)
const char *dipole_lj=0;
#else
#include "dipole_lj_cubin.h"
#endif
#include "lal_dipole_lj.h"
#include <cassert>
using namespace LAMMPS_AL;
#define DipoleLJT DipoleLJ<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
DipoleLJT::DipoleLJ() : BaseDipole<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
DipoleLJT::~DipoleLJ() {
clear();
}
template <class numtyp, class acctyp>
int DipoleLJT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int DipoleLJT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,dipole_lj,"k_dipole_lj");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void DipoleLJT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double DipoleLJT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(DipoleLJ<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->ans->force,
&this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class DipoleLJ<PRECISION,ACC_PRECISION>;
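type_pack4 above interleaves four per-type-pair coefficient tables into one vector of 4-component values, so a kernel reads lj1, lj2, and both squared cutoffs for a pair with a single load of lj1[mtype]. A minimal host-side sketch of that layout; type_pack4 itself lives elsewhere in the library, so this packing loop is an illustration only:
#include <vector>
struct numtyp4 { float x, y, z, w; };  // mirrors the device 4-vector
// Pack four ntypes x ntypes tables row-major, matching the
// lj1.x=lj1, lj1.y=lj2, lj1.z=cut_ljsq, lj1.w=cut_coulsq layout
// consumed by the dipole/cut kernels.
std::vector<numtyp4> pack4(int ntypes, double **a, double **b,
                           double **c, double **d) {
  std::vector<numtyp4> out(ntypes*ntypes);
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      numtyp4 &p = out[i*ntypes + j];
      p.x = (float)a[i][j]; p.y = (float)b[i][j];
      p.z = (float)c[i][j]; p.w = (float)d[i][j];
    }
  return out;
}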

501
lib/gpu/lal_dipole_lj.cu Normal file
View File

@ -0,0 +1,501 @@
// **************************************************************************
// dipole_lj.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the dipole/cut pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[8][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
red_acc[7][tid]=ecoul; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<8; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
ecoul=red_acc[7][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=ecoul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*r3inv;
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
r7inv = r5inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
pre2 = (numtyp)3.0*r5inv*pjdotr;
pre3 = (numtyp)3.0*r5inv*pidotr;
pre4 = (numtyp)(-1.0)*r3inv;
forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x;
forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y;
forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr;
pre2 = qj*r3inv;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
pre2 = qtmp*r3inv;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
e = qtmp*qj*rinv;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv, r7inv;
numtyp pre1, pre2, pre3, pre4;
numtyp pdotp, pidotr, pjdotr;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*r3inv;
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
r7inv = r5inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
pre2 = (numtyp)3.0*r5inv*pjdotr;
pre3 = (numtyp)3.0*r5inv*pidotr;
pre4 = (numtyp)(-1.0)*r3inv;
forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x;
forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y;
forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr;
pre2 = qj*r3inv;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
pre2 = qtmp*r3inv;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0;
if (rsq < lj1[mtype].w) {
e = qtmp*qj*rinv;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr;
e *= fq;
} else e = (acctyp)0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
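The store_answers_tq macro defined at the top of this file reduces per-thread partial forces and torques with a strided binary tree: at each step the first half of the t_per_atom slots accumulates the second half. A serial illustration of the same loop on a plain array (no __local memory or thread synchronization is needed here):
#include <cstdio>
int main() {
  // Partial sums from t_per_atom = 8 cooperating threads (illustrative).
  double red_acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  unsigned int t_per_atom = 8;
  // Same stride pattern as in store_answers_tq:
  for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
    for (unsigned int offset = 0; offset < s; offset++)
      red_acc[offset] += red_acc[offset + s];
  std::printf("sum = %g\n", red_acc[0]);  // 36, the total of all partials
  return 0;
}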

85
lib/gpu/lal_dipole_lj.h Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
dipole_lj.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/cut pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_DIPOLE_LJ_H
#define LAL_DIPOLE_LJ_H
#include "lal_base_dipole.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleLJ : public BaseDipole<numtyp, acctyp> {
public:
DipoleLJ();
~DipoleLJ();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
- 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

128
lib/gpu/lal_dipole_lj_ext.cpp Normal file
View File

@ -0,0 +1,128 @@
/***************************************************************************
dipole_lj_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to dipole/cut acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_dipole_lj.h"
using namespace std;
using namespace LAMMPS_AL;
static DipoleLJ<PRECISION,ACC_PRECISION> DPLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
DPLMF.clear();
gpu_mode=DPLMF.device->gpu_mode();
double gpu_split=DPLMF.device->particle_split();
int first_gpu=DPLMF.device->first_device();
int last_gpu=DPLMF.device->last_device();
int world_me=DPLMF.device->world_me();
int gpu_rank=DPLMF.device->gpu_rank();
int procs_per_gpu=DPLMF.device->procs_per_gpu();
DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu);
bool message=false;
if (DPLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
DPLMF.estimate_gpu_overhead();
return init_ok;
}
void dpl_gpu_clear() {
DPLMF.clear();
}
int** dpl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double **host_mu,
double *boxlo, double *prd) {
return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, host_mu, boxlo, prd);
}
void dpl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
double **host_mu, const int nlocal, double *boxlo, double *prd) {
DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
nlocal,boxlo,prd);
}
double dpl_gpu_bytes() {
return DPLMF.host_memory_usage();
}

170
lib/gpu/lal_dipole_lj_sf.cpp Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
dipole_lj_sf.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/sf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "dipole_lj_sf_cl.h"
#elif defined(USE_CUDART)
const char *dipole_lj_sf=0;
#else
#include "dipole_lj_sf_cubin.h"
#endif
#include "lal_dipole_lj_sf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define DipoleLJSFT DipoleLJSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
DipoleLJSFT::DipoleLJSF() : BaseDipole<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
DipoleLJSFT::~DipoleLJSF() {
clear();
}
template <class numtyp, class acctyp>
int DipoleLJSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int DipoleLJSFT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,dipole_lj_sf,"k_dipole_lj_sf");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_cutsq);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void DipoleLJSFT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double DipoleLJSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(DipoleLJSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3,
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
&this->atom->quat, &cutsq,
&_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class DipoleLJSF<PRECISION,ACC_PRECISION>;
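The shifted-force LJ term in the dipole/sf kernels (next file) subtracts forceljsf, the magnitude the unshifted potential would have at the cutoff, so the pair force goes to zero continuously at r = rc. A quick numeric check of that property with illustrative coefficients:
#include <cstdio>
// F(r)/r for 12-6 LJ, matching r6inv*(lj1*r6inv-lj2)*r2inv in the kernel.
double lj_force_over_r(double rsq, double lj1, double lj2) {
  double r2inv = 1.0/rsq;
  double r6inv = r2inv*r2inv*r2inv;
  return r6inv*(lj1*r6inv - lj2)*r2inv;
}
int main() {
  double lj1 = 48.0, lj2 = 24.0;                 // illustrative coefficients
  double rc = 2.5;
  double shift = lj_force_over_r(rc*rc, lj1, lj2);  // forceljsf in the kernel
  for (double r = 2.3; r <= rc + 1e-9; r += 0.1)
    std::printf("r=%.1f  shifted F/r = %+.3e\n", r,
                lj_force_over_r(r*r, lj1, lj2) - shift);
  return 0;   // the last line, at r = rc, prints exactly +0.000e+00
}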

562
lib/gpu/lal_dipole_lj_sf.cu Normal file
View File

@ -0,0 +1,562 @@
// **************************************************************************
// dipole_lj_sf.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the dipole/sf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[8][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=tor.x; \
red_acc[4][tid]=tor.y; \
red_acc[5][tid]=tor.z; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
tor.x=red_acc[3][tid]; \
tor.y=red_acc[4][tid]; \
tor.z=red_acc[5][tid]; \
if (eflag>0 || vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
red_acc[6][tid]=energy; \
red_acc[7][tid]=ecoul; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<8; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
energy=red_acc[6][tid]; \
ecoul=red_acc[7][tid]; \
} \
} \
if (offset==0) { \
engv+=ii; \
if (eflag>0) { \
*engv=energy; \
engv+=inum; \
*engv=ecoul; \
engv+=inum; \
} \
if (vflag>0) { \
for (int i=0; i<6; i++) { \
*engv=virial[i]; \
engv+=inum; \
} \
} \
ans[ii]=f; \
ans[ii+inum]=tor; \
}
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
texture<float4> mu_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
texture<int4,1> mu_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#define mu_tex mu_
#endif
__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv;
numtyp pre1, pre2, pre4;
numtyp pdotp, pidotr, pjdotr;
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
numtyp forceljcut = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
rcutlj2inv = ucl_recip(lj1[mtype].z);
rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;
force_lj = factor_lj * (forceljcut - forceljsf);
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
rcutcoul2inv = ucl_recip(lj1[mtype].w);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
aforcecoul.x = pre1*delx;
aforcecoul.y = pre1*dely;
aforcecoul.z = pre1*delz;
bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
(numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);
forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);
pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
pre4 = -bfac*r3inv;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
rcutcoul2inv=ucl_recip(lj1[mtype].w);
pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qj*r3inv * pqfac;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
rcutcoul2inv=ucl_recip(lj1[mtype].w);
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qtmp*r3inv * qpfac;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
e = qtmp*qj*rinv*fac*fac;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr * pqfac;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr * qpfac;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
(numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
(numtyp)4.0*lj3[mtype].y);
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp4 *mu_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp4 tor;
tor.x=(acctyp)0;
tor.y=(acctyp)0;
tor.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp qj; fetch(qj,j,q_tex);
numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp force_lj, r6inv;
numtyp rinv, r3inv, r5inv;
numtyp pre1, pre2, pre4;
numtyp pdotp, pidotr, pjdotr;
numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
numtyp4 aforcecoul, bforcecoul;
acctyp4 forcecoul, ticoul;
acctyp4 force;
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
numtyp forceljcut = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv; // factor_lj applied once, below
rcutlj2inv = ucl_recip(lj1[mtype].z);
rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;
force_lj = factor_lj * (forceljcut - forceljsf);
} else force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
rinv = ucl_rsqrt(rsq);
rcutcoul2inv = ucl_recip(lj1[mtype].w);
// charge-charge
if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);
forcecoul.x += pre1*delx;
forcecoul.y += pre1*dely;
forcecoul.z += pre1*delz;
}
// dipole-dipole
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
aforcecoul.x = pre1*delx;
aforcecoul.y = pre1*dely;
aforcecoul.z = pre1*delz;
bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
(numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);
forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);
pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
pre4 = -bfac*r3inv;
numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);
ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
}
// dipole-charge
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qj*r3inv * pqfac;
forcecoul.x += pre2*mui.x - pre1*delx;
forcecoul.y += pre2*mui.y - pre1*dely;
forcecoul.z += pre2*mui.z - pre1*delz;
ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
}
// charge-dipole
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
(numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
pre2 = qtmp*r3inv * qpfac;
forcecoul.x += pre1*delx - pre2*muj.x;
forcecoul.y += pre1*dely - pre2*muj.y;
forcecoul.z += pre1*delz - pre2*muj.z;
}
} else {
forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
}
numtyp fq = factor_coul*qqrd2e;
force.x = fq*forcecoul.x + delx*force_lj;
force.y = fq*forcecoul.y + dely*force_lj;
force.z = fq*forcecoul.z + delz*force_lj;
f.x+=force.x;
f.y+=force.y;
f.z+=force.z;
tor.x+=fq*ticoul.x;
tor.y+=fq*ticoul.y;
tor.z+=fq*ticoul.z;
if (eflag>0) {
acctyp e = (acctyp)0.0;
if (rsq < lj1[mtype].w) {
numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
e = qtmp*qj*rinv*fac*fac;
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
e += -qj*r3inv*pidotr * pqfac;
if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
e += qtmp*r3inv*pjdotr * qpfac;
e *= fq;
} else e = (acctyp)0.0;
e_coul += e;
if (rsq < lj1[mtype].z) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
(numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
(numtyp)4.0*lj3[mtype].y);
energy+=factor_lj*e;
}
}
if (vflag>0) {
virial[0] += delx*force.x;
virial[1] += dely*force.y;
virial[2] += delz*force.z;
virial[3] += delx*force.y;
virial[4] += delx*force.z;
virial[5] += dely*force.z;
}
}
} // for nbor
store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
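For reference, the switching factors implemented by both kernels above can be written out explicitly; this is a reading aid, not part of the commit. With r_c the coulombic cutoff (lj1.w stores r_c^2, so rcutcoul2inv = 1/r_c^2):
\[
E_{qq}(r) = \frac{q_i q_j}{r}\left(1 - \frac{r}{r_c}\right)^{2}, \qquad
\mathrm{afac} = 1 - \frac{r^4}{r_c^4}, \qquad
\mathrm{bfac} = 1 - 4\,\frac{r^3}{r_c^3} + 3\,\frac{r^4}{r_c^4}, \qquad
\mathrm{pqfac} = \mathrm{qpfac} = 1 - 3\,\frac{r^2}{r_c^2} + 2\,\frac{r^3}{r_c^3}
\]
Every factor vanishes at r = r_c, so the electrostatic forces and torques go smoothly to zero at the cutoff; this is what makes the truncation "shifted-force".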

@ -0,0 +1,85 @@
/***************************************************************************
dipole_lj_sf.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the dipole/sf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_DIPOLE_LJ_SF_H
#define LAL_DIPOLE_LJ_SF_H
#include "lal_base_dipole.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
public:
DipoleLJSF();
~DipoleLJSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = cutsq
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,128 @@
/***************************************************************************
dipole_lj_sf_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to dipole/sf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_dipole_lj_sf.h"
using namespace std;
using namespace LAMMPS_AL;
static DipoleLJSF<PRECISION,ACC_PRECISION> DPLSFMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
DPLSFMF.clear();
gpu_mode=DPLSFMF.device->gpu_mode();
double gpu_split=DPLSFMF.device->particle_split();
int first_gpu=DPLSFMF.device->first_device();
int last_gpu=DPLSFMF.device->last_device();
int world_me=DPLSFMF.device->world_me();
int gpu_rank=DPLSFMF.device->gpu_rank();
int procs_per_gpu=DPLSFMF.device->procs_per_gpu();
DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu);
bool message=false;
if (DPLSFMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLSFMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
DPLSFMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
DPLSFMF.estimate_gpu_overhead();
return init_ok;
}
void dplsf_gpu_clear() {
DPLSFMF.clear();
}
int** dplsf_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double **host_mu,
double *boxlo, double *prd) {
return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, host_mu, boxlo, prd);
}
void dplsf_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
double **host_mu, const int nlocal, double *boxlo, double *prd) {
DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
nlocal,boxlo,prd);
}
double dplsf_gpu_bytes() {
return DPLSFMF.host_memory_usage();
}

146
lib/gpu/lal_gauss.cpp Normal file
@ -0,0 +1,146 @@
/***************************************************************************
gauss.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the gauss pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "gauss_cl.h"
#elif defined(USE_CUDART)
const char *gauss=0;
#else
#include "gauss_cubin.h"
#endif
#include "lal_gauss.h"
#include <cassert>
using namespace LAMMPS_AL;
#define GaussT Gauss<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
GaussT::Gauss() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
GaussT::~Gauss() {
clear();
}
template <class numtyp, class acctyp>
int GaussT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int GaussT::init(const int ntypes,
double **host_cutsq, double **host_a,
double **host_b, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,gauss,"k_gauss");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
host_cutsq,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=gauss1.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void GaussT::clear() {
if (!_allocated)
return;
_allocated=false;
gauss1.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double GaussT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Gauss<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void GaussT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class Gauss<PRECISION,ACC_PRECISION>;
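The launch geometry in GaussT::loop() above splits each thread block of BX threads among t_per_atom cooperating threads per atom. A worked example of the arithmetic; the values below are illustrative, not library defaults:
#include <cmath>
#include <cstdio>
int main() {
  const int BX = 128;        // threads per block, this->block_size()
  const int t_per_atom = 4;  // threads cooperating on one atom
  const int inum = 10000;    // local atom count, this->ans->inum()
  // Each block covers BX/t_per_atom = 32 atoms, so the grid needs
  // ceil(10000/32) = 313 blocks.
  int GX = static_cast<int>(ceil(static_cast<double>(inum)/(BX/t_per_atom)));
  printf("GX = %d blocks\n", GX);   // prints GX = 313
  return 0;
}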

189
lib/gpu/lal_gauss.cu Normal file
@ -0,0 +1,189 @@
// **************************************************************************
// gauss.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the gauss pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
#else
texture<int4,1> pos_tex;
#endif
#else
#define pos_tex x_
#endif
__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<gauss1[mtype].z) {
numtyp r2inv = ucl_recip(rsq);
numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
gauss1[mtype].w);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
gauss1[tid]=gauss1_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<gauss1[mtype].z) {
numtyp r2inv = ucl_recip(rsq);
numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
gauss1[mtype].w);
energy+=factor_lj*e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
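The force expression in both kernels follows directly from the Gaussian pair energy. As a reading aid (A = gauss1.x, B = gauss1.y, offset = gauss1.w):
\[
E(r) = -A\,e^{-B r^2} + E_{\text{offset}}, \qquad
\frac{dE}{dr} = 2AB\,r\,e^{-B r^2}, \qquad
f_{\text{pair}} = -\frac{1}{r}\frac{dE}{dr} = -2AB\,e^{-B r^2}
\]
In the kernels this appears as (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*ucl_exp(-gauss1[mtype].y*rsq)*r2inv, where rsq*r2inv cancels to one; multiplying by delx, dely, delz then gives the Cartesian force components.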

77
lib/gpu/lal_gauss.h Normal file
@ -0,0 +1,77 @@
/***************************************************************************
gauss.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the gauss pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_GAUSS_H
#define LAL_GAUSS_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Gauss : public BaseAtomic<numtyp, acctyp> {
public:
Gauss();
~Gauss();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_a, double **host_b, double **host_offset,
double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset
UCL_D_Vec<numtyp4> gauss1;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

120
lib/gpu/lal_gauss_ext.cpp Normal file
@ -0,0 +1,120 @@
/***************************************************************************
gauss_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to gauss acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_gauss.h"
using namespace std;
using namespace LAMMPS_AL;
static Gauss<PRECISION,ACC_PRECISION> GLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
double **host_b, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
GLMF.clear();
gpu_mode=GLMF.device->gpu_mode();
double gpu_split=GLMF.device->particle_split();
int first_gpu=GLMF.device->first_device();
int last_gpu=GLMF.device->last_device();
int world_me=GLMF.device->world_me();
int gpu_rank=GLMF.device->gpu_rank();
int procs_per_gpu=GLMF.device->procs_per_gpu();
GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu);
bool message=false;
if (GLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
GLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
GLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
GLMF.estimate_gpu_overhead();
return init_ok;
}
void gauss_gpu_clear() {
GLMF.clear();
}
int ** gauss_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void gauss_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double gauss_gpu_bytes() {
return GLMF.host_memory_usage();
}
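A minimal sketch of how a host code might drive the gauss_gpu_* interface above. All tables and the neighbor list come from the caller (in LAMMPS, the GPU variant of the gauss pair style); the argument values used here are placeholders, not library defaults:
#include <cstdio>
// Declarations matching the definitions in lal_gauss_ext.cpp:
int  gauss_gpu_init(const int, double**, double**, double**, double**,
                    double*, const int, const int, const int, const int,
                    const double, int&, FILE*);
void gauss_gpu_compute(const int, const int, const int, double**, int*,
                       int*, int*, int**, const bool, const bool,
                       const bool, const bool, int&, const double, bool&);
void gauss_gpu_clear();
double gauss_gpu_bytes();

void run_step(int ntypes, double **cutsq, double **a, double **b,
              double **offset, double *special_lj, int inum, int nall,
              double cell_size, double **x, int *type, int *ilist,
              int *numj, int **firstneigh) {
  int gpu_mode, host_start;
  bool success;
  if (gauss_gpu_init(ntypes, cutsq, a, b, offset, special_lj, inum, nall,
                     300, 0, cell_size, gpu_mode, stdout) != 0)
    return;                          // fall back to the CPU pair style
  gauss_gpu_compute(0, inum, nall, x, type, ilist, numj, firstneigh,
                    true, true, false, false, host_start, 0.0, success);
  printf("library host memory: %g bytes\n", gauss_gpu_bytes());
  gauss_gpu_clear();                 // release host and device buffers
}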

@ -0,0 +1,168 @@
/***************************************************************************
lj_coul_debye.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/debye pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "lj_coul_debye_cl.h"
#elif defined(USE_CUDART)
const char *lj_coul_debye=0;
#else
#include "lj_coul_debye_cubin.h"
#endif
#include "lal_lj_coul_debye.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJCoulDebyeT LJCoulDebye<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJCoulDebyeT::LJCoulDebye() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJCoulDebyeT::~LJCoulDebye() {
clear();
}
template <class numtyp, class acctyp>
int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJCoulDebyeT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double kappa) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_coul_debye,"k_lj_debye");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_kappa=kappa;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJCoulDebyeT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
cutsq.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJCoulDebyeT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJCoulDebye<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
&_qqrd2e, &_kappa, &this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJCoulDebye<PRECISION,ACC_PRECISION>;
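For readers following the kernels below, the layout produced by the two type_pack4() calls above can be pictured as in this sketch. The real type_pack4() lives elsewhere in the library (lal_atom.h) and handles LAMMPS's 1-based type indexing; the coefficient meanings follow the usual LAMMPS convention (lj1 = 48*eps*sigma^12, lj2 = 24*eps*sigma^6, lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6) and are an assumption here, not stated in this file:
// Sketch of the per-type-pair packing consumed by the k_lj_debye kernels.
struct quad { double x, y, z, w; };
void pack_types(int ntypes, int lj_types, quad *lj1, quad *lj3,
                double **host_lj1, double **host_lj2, double **host_lj3,
                double **host_lj4, double **host_cut_ljsq,
                double **host_cut_coulsq, double **host_offset) {
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      const int m = i*lj_types + j;      // mtype index used in the kernels
      lj1[m].x = host_lj1[i][j];         // LJ force coefficient
      lj1[m].y = host_lj2[i][j];         // LJ force coefficient
      lj1[m].z = host_cut_ljsq[i][j];    // LJ cutoff squared
      lj1[m].w = host_cut_coulsq[i][j];  // coulomb cutoff squared
      lj3[m].x = host_lj3[i][j];         // LJ energy coefficient
      lj3[m].y = host_lj4[i][j];         // LJ energy coefficient
      lj3[m].z = host_offset[i][j];      // energy shift at the LJ cutoff
    }
}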

@ -0,0 +1,256 @@
// **************************************************************************
// lj_coul_debye.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the lj/cut/coul/debye pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
__kernel void k_lj_debye(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r, rinv, screening;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
r = ucl_sqrt(rsq);
rinv = ucl_recip(r);
fetch(screening,j,q_tex);
screening *= ucl_exp(-kappa*r);
forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
}
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (rsq < lj1[mtype].w) {
e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
__global numtyp *_cutsq, const numtyp qqrd2e,
const numtyp kappa,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[mtype]) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv, r, rinv, screening;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < lj1[mtype].w) {
r = ucl_sqrt(rsq);
rinv = ucl_recip(r);
fetch(screening,j,q_tex);
screening *= ucl_exp(-kappa*r);
forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
}
else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (rsq < lj1[mtype].w) {
e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
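The screened-coulomb branch in both kernels follows from differentiating the Debye-Hueckel pair energy. As a reading aid (not part of the commit):
\[
E(r) = q_i q_j\,\frac{e^{-\kappa r}}{r}, \qquad
F(r) = -\frac{dE}{dr} = q_i q_j\,\frac{e^{-\kappa r}}{r}\left(\kappa + \frac{1}{r}\right)
\]
The kernels store r*F(r) in forcecoul (the screening variable carries q_j*exp(-kappa*r), fetched from q_tex); the subsequent multiply by r2inv and by delx, dely, delz yields F(r)*Delta/r, the Cartesian force components, up to the qqrd2e unit conversion and the factor_coul special-bond weight.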

@ -0,0 +1,85 @@
/***************************************************************************
lj_coul_debye.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the lj/cut/coul/debye pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_COUL_DEBYE_H
#define LAL_LJ_COUL_DEBYE_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
public:
LJCoulDebye();
~LJCoulDebye();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double kappa);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// cutsq
UCL_D_Vec<numtyp> cutsq;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e,_kappa;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif

@ -0,0 +1,129 @@
/***************************************************************************
lj_coul_debye_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to lj/cut/coul/debye acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_coul_debye.h"
using namespace std;
using namespace LAMMPS_AL;
static LJCoulDebye<PRECISION,ACC_PRECISION> LJCDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double kappa) {
LJCDMF.clear();
gpu_mode=LJCDMF.device->gpu_mode();
double gpu_split=LJCDMF.device->particle_split();
int first_gpu=LJCDMF.device->first_device();
int last_gpu=LJCDMF.device->last_device();
int world_me=LJCDMF.device->world_me();
int gpu_rank=LJCDMF.device->gpu_rank();
int procs_per_gpu=LJCDMF.device->procs_per_gpu();
LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu);
bool message=false;
if (LJCDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
LJCDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
LJCDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJCDMF.estimate_gpu_overhead();
return init_ok;
}
void ljcd_gpu_clear() {
LJCDMF.clear();
}
int** ljcd_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcd_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljcd_gpu_bytes() {
return LJCDMF.host_memory_usage();
}

168
lib/gpu/lal_lj_dsf.cpp Normal file
@ -0,0 +1,168 @@
/***************************************************************************
lj_dsf.cpp
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#if defined(USE_OPENCL)
#include "lj_dsf_cl.h"
#elif defined(USE_CUDART)
const char *lj_dsf=0;
#else
#include "lj_dsf_cubin.h"
#endif
#include "lal_lj_dsf.h"
#include <cassert>
using namespace LAMMPS_AL;
#define LJDSFT LJDSF<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
LJDSFT::LJDSF() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
LJDSFT::~LJDSF() {
clear();
}
template <class numtyp, class acctyp>
int LJDSFT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double e_shift, const double f_shift,
const double alpha) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_dsf,"k_lj_dsf");
if (success!=0)
return success;
_cut_coulsq=host_cut_coulsq;
_e_shift=e_shift;
_f_shift=f_shift;
_alpha=alpha;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_qqrd2e=qqrd2e;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void LJDSFT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJDSFT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJDSF<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJDSFT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
template class LJDSF<PRECISION,ACC_PRECISION>;
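The shared_types test in init() above decides between the generic kernel and the _fast variant in the device file below. A worked example, assuming a device where max_shared_types() returns 8; that value is device-dependent, so treat it as illustrative:
#include <cstdio>
int main() {
  const int ntypes = 4;             // atom types in the simulation
  const int max_shared_types = 8;   // assumed this->device->max_shared_types()
  const int block_size = 128;       // assumed this->_block_size
  int lj_types = ntypes;
  bool shared_types = false;
  if (lj_types <= max_shared_types && block_size >= max_shared_types) {
    lj_types = max_shared_types;    // coefficient tables padded to 8x8
    shared_types = true;            // k_lj_dsf_fast will be launched
  }
  // The fast kernel indexes the padded table as
  //   mtype = itype*MAX_SHARED_TYPES + jtype
  // (fast_mul((int)MAX_SHARED_TYPES,iw) + jx.w in the kernel), and all
  // 64 numtyp4 entries fit in __local/shared memory.
  printf("lj_types=%d shared=%d\n", lj_types, (int)shared_types);
  return 0;
}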

261
lib/gpu/lal_lj_dsf.cu Normal file
@ -0,0 +1,261 @@
// **************************************************************************
// lj_dsf.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the lj/cut/coul/dsf pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : 7/12/2012
// email : brownw@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif
#else
#define pos_tex x_
#define q_tex q_
#endif
#define MY_PIS (acctyp)1.77245385090551602729
// Abramowitz-Stegun 7.1.26 constants for the erfc() approximation in the
// kernels below (standard values; required for EWALD_P and A1-A5 to resolve):
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].w) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp e_shift, const numtyp f_shift,
const numtyp alpha, const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[tid]=lj3_in[tid];
}
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul, r, prefactor, erfcc;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].w) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, r6inv;
if (rsq < lj1[mtype].z) {
r6inv = r2inv*r2inv*r2inv;
force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= factor_coul * qqrd2e*qtmp/r;
numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd +
rsq*f_shift);
} else
forcecoul = (numtyp)0.0;
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
if (rsq < cut_coulsq) {
numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
e_coul += e;
}
if (rsq < lj1[mtype].z) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}
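Both kernels above approximate erfc(alpha*r) with a polynomial in t = 1/(1 + EWALD_P*alpha*r). Below is a quick host-side sanity check of that fit against a library erfc, assuming the standard Abramowitz-Stegun 7.1.26 coefficients; the file's own EWALD_P/A1-A5 defines are not shown in this excerpt and are assumed to match.
// Hedged host-side check of the erfc polynomial fit (assumed A&S 7.1.26
// coefficients; not taken from this commit's defines).
#include <cmath>
#include <cstdio>
int main() {
  const double EWALD_P = 0.3275911;
  const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 = 1.061405429;
  const double alpha = 0.25;
  for (double r = 1.0; r <= 8.0; r += 1.0) {
    double t = 1.0/(1.0 + EWALD_P*alpha*r);
    double fit = t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))
                 * std::exp(-alpha*alpha*r*r);
    std::printf("r=%3.1f fit=%.8f erfc=%.8f\n", r, fit, std::erfc(alpha*r));
  }
  return 0;
}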

85
lib/gpu/lal_lj_dsf.h Normal file
View File

@ -0,0 +1,85 @@
/***************************************************************************
lj_dsf.h
-------------------
W. Michael Brown (ORNL)
Class for acceleration of the lj/cut/coul/dsf pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#ifndef LAL_LJ_DSF_H
#define LAL_LJ_DSF_H
#include "lal_base_charge.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class LJDSF : public BaseCharge<numtyp, acctyp> {
public:
LJDSF();
~LJDSF();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double e_shift, const double f_shift,
const double alpha);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _qqrd2e;
private:
bool _allocated;
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
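The e_shift and f_shift arguments to init() implement damped-shifted-force truncation: both the pair energy and force are shifted so they vanish at the Coulomb cutoff. A small numeric check of that property, assuming the Fennell-Gezelter definitions of the two shifts (the host code that actually fills these arguments is not part of this excerpt):
// Hedged check that DSF shifting zeroes energy and force at the cutoff;
// the e_shift/f_shift expressions are assumptions from the DSF method.
#include <cmath>
#include <cstdio>
int main() {
  const double alpha = 0.25, rc = 10.0;            // illustrative values
  const double pis = std::sqrt(std::acos(-1.0));   // sqrt(pi)
  const double erfcc = std::erfc(alpha*rc);
  const double erfcd = std::exp(-alpha*alpha*rc*rc);
  const double f_shift = -(erfcc/(rc*rc) + 2.0*alpha/pis*erfcd/rc);
  const double e_shift = erfcc/rc - f_shift*rc;
  // Per unit charge product, mirroring the kernel's expressions at r = rc:
  double e = erfcc/rc - e_shift - rc*f_shift;                   // -> 0
  double f = erfcc/(rc*rc) + 2.0*alpha/pis*erfcd/rc + f_shift;  // -> 0
  std::printf("E(rc)=%.3e  F(rc)=%.3e\n", e, f);
  return 0;
}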

132
lib/gpu/lal_lj_dsf_ext.cpp Normal file
View File

@ -0,0 +1,132 @@
/***************************************************************************
lj_dsf_ext.cpp
-------------------
W. Michael Brown (ORNL)
Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin : 7/12/2012
email : brownw@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_lj_dsf.h"
using namespace std;
using namespace LAMMPS_AL;
static LJDSF<PRECISION,ACC_PRECISION> LJDMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double e_shift, const double f_shift,
const double alpha) {
LJDMF.clear();
gpu_mode=LJDMF.device->gpu_mode();
double gpu_split=LJDMF.device->particle_split();
int first_gpu=LJDMF.device->first_device();
int last_gpu=LJDMF.device->last_device();
int world_me=LJDMF.device->world_me();
int gpu_rank=LJDMF.device->gpu_rank();
int procs_per_gpu=LJDMF.device->procs_per_gpu();
LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu);
bool message=false;
if (LJDMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
f_shift, alpha);
LJDMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
f_shift, alpha);
LJDMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
LJDMF.estimate_gpu_overhead();
return init_ok;
}
void ljd_gpu_clear() {
LJDMF.clear();
}
int** ljd_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljd_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljd_gpu_bytes() {
return LJDMF.host_memory_usage();
}

315
lib/gpu/lal_yukawa_colloid.cpp Normal file
View File

@ -0,0 +1,315 @@
/***************************************************************************
yukawa_colloid.cpp
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the yukawa/colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifdef USE_OPENCL
#include "yukawa_colloid_cl.h"
#elif defined(USE_CUDART)
const char *yukawa_colloid=0;
#else
#include "yukawa_colloid_cubin.h"
#endif
#include "lal_yukawa_colloid.h"
#include <cassert>
using namespace LAMMPS_AL;
#define YukawaColloidT YukawaColloid<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
_allocated(false), _max_rad_size(0) {
}
template <class numtyp, class acctyp>
YukawaColloidT::~YukawaColloid() {
clear();
}
template <class numtyp, class acctyp>
int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int YukawaColloidT::init(const int ntypes,
double **host_cutsq, double **host_a,
double **host_offset, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen, const double kappa) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,yukawa_colloid,"k_yukawa_colloid");
if (success!=0)
return success;
// allocate rad
bool cpuview=false;
if (this->ucl_device->device_type()==UCL_CPU)
cpuview=true;
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
_max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (cpuview)
dev_rad.view(host_rad);
else
dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY);
rad_tex.get_texture(*(this->pair_program),"rad_tex");
rad_tex.bind_float(dev_rad,1);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
_kappa = kappa;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types*32; i++)
host_write[i]=(numtyp)0.0;
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
host_offset,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void YukawaColloidT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff.clear();
sp_lj.clear();
host_rad.clear();
dev_rad.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double YukawaColloidT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(YukawaColloid<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then compute atom energies/forces
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void YukawaColloidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type, int *ilist,
int *numj, int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *rad) {
this->acc_timers();
// ------------------- Resize rad array --------------------------
if (nall>_max_rad_size) {
dev_rad.clear();
host_rad.clear();
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
}
} else {
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
}
rad_tex.bind_float(dev_rad,1);
}
// ----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if a different hybrid style reallocated data
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
host_start=inum;
// -----------------------------------------------------------------
if (ago==0) {
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
this->atom->cast_x_data(host_x,host_type);
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
this->add_rad_data();
this->loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU and then compute per-atom densities
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time, bool &success,
double *rad) {
this->acc_timers();
// ------------------- Resize rad array ----------------------------
if (nall>_max_rad_size) {
dev_rad.clear();
host_rad.clear();
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
host_rad.alloc(_max_rad_size,*(this->ucl_device));
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
}
} else {
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
}
rad_tex.bind_float(dev_rad,1);
}
// -----------------------------------------------------------------
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if a different hybrid style reallocated data
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
// load balance, returning the atom count on the device (inum)
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->cast_rad_data(rad);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
this->add_rad_data();
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
this->loop(eflag,vflag);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate per-atom energies and forces
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
}
this->time_pair.stop();
}
template class YukawaColloid<PRECISION,ACC_PRECISION>;
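For reference, the grid sizing in loop() gives each block BX/t_per_atom atoms, so GX blocks cover all device atoms. A standalone illustration of that arithmetic (the BX, t_per_atom, and inum values are invented for the example):
// Illustrative grid-size arithmetic matching loop(); values are examples,
// not values taken from this commit.
#include <cmath>
#include <cstdio>
int main() {
  const int BX = 128;        // threads per block
  const int t_per_atom = 4;  // threads cooperating on one atom
  const int inum = 10000;    // atoms assigned to the device
  int GX = static_cast<int>(std::ceil(static_cast<double>(inum)/
                                      (BX/t_per_atom)));
  std::printf("%d atoms per block -> GX = %d blocks\n", BX/t_per_atom, GX);
  return 0;
}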

202
lib/gpu/lal_yukawa_colloid.cu Normal file
View File

@ -0,0 +1,202 @@
// **************************************************************************
// yukawa_colloid.cu
// -------------------
// Trung Dac Nguyen (ORNL)
//
// Device code for acceleration of the yukawa/colloid pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : nguyentd@ornl.gov
// ***************************************************************************/
#ifdef NV_KERNEL
#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> rad_tex;
#else
texture<int4,1> pos_tex;
texture<int2> rad_tex;
#endif
#else
#define pos_tex x_
#define rad_tex rad_
#endif
__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp radj; fetch(radj,j,rad_tex);
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x/kappa * screening;
energy+=factor_lj*(e-coeff[mtype].y);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
__global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom,
const numtyp kappa) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
__local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff[tid]=coeff_in[tid];
}
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
__global int *nbor, *list_end;
int i, numj, n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,list_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp radi; fetch(radi,i,rad_tex);
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
numtyp radj; fetch(radj,j,rad_tex);
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=coeff[mtype].x/kappa * screening;
energy+=factor_lj*(e-coeff[mtype].y);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
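The kernels above evaluate a pair energy of the form E(r) = (a/kappa)*exp(-kappa*(r-(radi+radj))) minus a per-type offset, with radial force magnitude a*exp(-kappa*(r-(radi+radj))). A minimal host-side consistency check that this force expression is -dE/dr (all parameter values here are illustrative):
// Hedged check that force = -dE/dr for the yukawa/colloid form above;
// parameter values are invented for the example.
#include <cmath>
#include <cstdio>
int main() {
  const double a = 2.0, kappa = 1.5, radi = 0.8, radj = 1.2;
  const double r = 3.0, h = 1e-6;
  auto E = [&](double rr) {
    return a/kappa*std::exp(-kappa*(rr - (radi + radj)));
  };
  double analytic = a*std::exp(-kappa*(r - (radi + radj))); // as in kernel
  double numeric  = -(E(r + h) - E(r - h))/(2.0*h);         // central diff
  std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);
  return 0;
}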

127
lib/gpu/lal_yukawa_colloid.h Normal file
View File

@ -0,0 +1,127 @@
/***************************************************************************
yukawa_colloid.h
-------------------
Trung Dac Nguyen (ORNL)
Class for acceleration of the yukawa/colloid pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#ifndef LAL_YUKAWA_COLLOID_H
#define LAL_YUKAWA_COLLOID_H
#include "lal_base_atomic.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
public:
YukawaColloid();
~YukawaColloid();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the device
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_a, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double kappa);
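/// Stage per-atom radii for the device; on a CPU device with a double
/// numtyp this is a zero-copy view of the LAMMPS array, otherwise the
/// values are cast/copied into the pinned host buffer for add_rad_data()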
inline void cast_rad_data(double* rad) {
int nall = this->atom->nall();
if (this->ucl_device->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
dev_rad.view(host_rad);
} else {
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
}
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_rad.begin(),rad,nall*sizeof(numtyp));
else {
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
}
}
}
// Copy rad to device asynchronously
inline void add_rad_data() {
ucl_copy(dev_rad,host_rad,this->atom->nall(),true);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *rad);
/// Pair loop with device neighboring
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *rad);
// --------------------------- TEXTURES -----------------------------
UCL_Texture rad_tex;
// --------------------------- TYPE DATA --------------------------
/// coeff.x = a, coeff.y = offset, coeff.z = cutsq
UCL_D_Vec<numtyp4> coeff;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
int _max_rad_size;
numtyp _kappa;
/// Per-atom arrays
UCL_H_Vec<numtyp> host_rad;
UCL_D_Vec<numtyp> dev_rad;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
}
#endif
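The 1.10 factor used for _max_rad_size here and in the compute() resize paths over-allocates by ten percent so that small growth in the atom count does not trigger a reallocation every step. A minimal illustration of that policy (the nall sequence is invented):
// Sketch of the 10%-slack growth policy used for the rad arrays.
#include <cstdio>
#include <initializer_list>
int main() {
  int max_size = 0;
  for (int nall : {1000, 1050, 1099, 1200}) {
    if (nall > max_size) {
      max_size = static_cast<int>(static_cast<double>(nall)*1.10);
      std::printf("nall=%d -> realloc to capacity %d\n", nall, max_size);
    } else {
      std::printf("nall=%d -> reuse capacity %d\n", nall, max_size);
    }
  }
  return 0;
}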

123
lib/gpu/lal_yukawa_colloid_ext.cpp Normal file
View File

@ -0,0 +1,123 @@
/***************************************************************************
yukawa_colloid_ext.cpp
-------------------
Trung Dac Nguyen (ORNL)
Functions for LAMMPS access to yukawa/colloid acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : nguyentd@ornl.gov
***************************************************************************/
#include <iostream>
#include <cassert>
#include <math.h>
#include "lal_yukawa_colloid.h"
using namespace std;
using namespace LAMMPS_AL;
static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
double **host_offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
const double kappa) {
YKCOLLMF.clear();
gpu_mode=YKCOLLMF.device->gpu_mode();
double gpu_split=YKCOLLMF.device->particle_split();
int first_gpu=YKCOLLMF.device->first_device();
int last_gpu=YKCOLLMF.device->last_device();
int world_me=YKCOLLMF.device->world_me();
int gpu_rank=YKCOLLMF.device->gpu_rank();
int procs_per_gpu=YKCOLLMF.device->procs_per_gpu();
YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu);
bool message=false;
if (YKCOLLMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, kappa);
YKCOLLMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
inum, nall, 300, maxspecial, cell_size, gpu_split,
screen, kappa);
YKCOLLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
YKCOLLMF.estimate_gpu_overhead();
return init_ok;
}
void ykcolloid_gpu_clear() {
YKCOLLMF.clear();
}
int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_rad) {
return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_rad);
}
void ykcolloid_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_rad) {
YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,
success,host_rad);
}
double ykcolloid_gpu_bytes() {
return YKCOLLMF.host_memory_usage();
}