forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8694 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
31551d81fd
commit
647ea4c29f
|
@ -0,0 +1,224 @@
|
||||||
|
/***************************************************************************
|
||||||
|
ucl_matrix.h
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown
|
||||||
|
|
||||||
|
Matrix Container on Host
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : Thu May 10 2012
|
||||||
|
copyright : (C) 2012 by W. Michael Brown
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
This software is distributed under the Simplified BSD License.
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
// Only allow this file to be included by CUDA and OpenCL specific headers
|
||||||
|
#ifdef _UCL_MAT_ALLOW
|
||||||
|
|
||||||
|
/// Matrix S-Object
|
||||||
|
template <class hosttype, class devtype>
|
||||||
|
class UCL_Matrix {
|
||||||
|
public:
|
||||||
|
// Traits for copying data
|
||||||
|
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
|
||||||
|
enum traits {
|
||||||
|
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
|
||||||
|
MEM_TYPE = 1,
|
||||||
|
PADDED = 0,
|
||||||
|
ROW_MAJOR = 1,
|
||||||
|
VECTOR = 0
|
||||||
|
};
|
||||||
|
typedef hosttype data_type;
|
||||||
|
|
||||||
|
/// Host Allocation
|
||||||
|
UCL_H_Mat<hosttype> host;
|
||||||
|
|
||||||
|
/// Device Allocation
|
||||||
|
UCL_D_Mat<devtype> device;
|
||||||
|
|
||||||
|
UCL_Matrix() { }
|
||||||
|
~UCL_Matrix() { }
|
||||||
|
|
||||||
|
/// Construct with specied number of rows and columns
|
||||||
|
/** \sa alloc() **/
|
||||||
|
UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Set up host matrix with specied # of rows/cols and reserve memory
|
||||||
|
/** The kind1 parameter controls memory pinning as follows:
|
||||||
|
* - UCL_NOT_PINNED - Memory is not pinned
|
||||||
|
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
|
||||||
|
* - UCL_RW_OPTIMIZED - Memory can be pinned
|
||||||
|
* The kind2 parameter controls memory optimizations as follows:
|
||||||
|
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||||
|
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
|
||||||
|
* - UCL_READ_ONLY - Specify that you will only read in kernels
|
||||||
|
* \note When passing a command queue instead of a device, the device
|
||||||
|
* allocation is always performed. Even if the device shares memory
|
||||||
|
* with the host.
|
||||||
|
* \param cq Default command queue for operations copied from another mat
|
||||||
|
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||||
|
template <class mat_type>
|
||||||
|
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Set up host matrix with specied # of rows/cols and reserve memory
|
||||||
|
/** The kind1 parameter controls memory pinning as follows:
|
||||||
|
* - UCL_NOT_PINNED - Memory is not pinned
|
||||||
|
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
|
||||||
|
* - UCL_RW_OPTIMIZED - Memory can be pinned
|
||||||
|
* The kind2 parameter controls memory optimizations as follows:
|
||||||
|
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||||
|
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
|
||||||
|
* - UCL_READ_ONLY - Specify that you will only read in kernels
|
||||||
|
* \param device Used to get the default command queue for operations
|
||||||
|
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||||
|
inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Free memory and set size to 0
|
||||||
|
inline void clear()
|
||||||
|
{ host.clear(); device.clear(); }
|
||||||
|
|
||||||
|
/// Resize the allocation to contain cols elements
|
||||||
|
inline int resize(const int rows, const int cols) {
|
||||||
|
assert(host.kind()!=UCL_VIEW);
|
||||||
|
int err=host.resize(rows,cols);
|
||||||
|
if (err!=UCL_SUCCESS)
|
||||||
|
return err;
|
||||||
|
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
dev_resize(device,host,_buffer,rows,cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resize (only if bigger) the allocation to contain cols elements
|
||||||
|
inline int resize_ib(const int new_rows, const int new_cols)
|
||||||
|
{ if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
|
||||||
|
else return UCL_SUCCESS; }
|
||||||
|
|
||||||
|
/// Set each element to zero
|
||||||
|
inline void zero() { host.zero(); device.zero(); }
|
||||||
|
|
||||||
|
/// Set first n elements to zero
|
||||||
|
inline void zero(const int n) { host.zero(n); device.zero(n); }
|
||||||
|
|
||||||
|
/// Get the number of elements
|
||||||
|
inline size_t numel() const { return host.numel(); }
|
||||||
|
/// Get the number of rows
|
||||||
|
inline size_t rows() const { return host.rows(); }
|
||||||
|
/// Get the number of columns
|
||||||
|
inline size_t cols() const { return host.cols(); }
|
||||||
|
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||||
|
inline size_t host_mem_usage()
|
||||||
|
{ return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
|
||||||
|
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||||
|
inline size_t device_mem_usage()
|
||||||
|
{ return device.row_bytes()*device.rows(); }
|
||||||
|
|
||||||
|
/// Get element at index i
|
||||||
|
inline hosttype & operator[](const int i) { return host[i]; }
|
||||||
|
/// Get element at index i
|
||||||
|
inline const hosttype & operator[](const int i) const { return host[i]; }
|
||||||
|
/// 2D access (row should always be 0)
|
||||||
|
inline hosttype & operator()(const int row, const int col)
|
||||||
|
{ return host(row,col); }
|
||||||
|
/// 2D access (row should always be 0)
|
||||||
|
inline const hosttype & operator()(const int row, const int col) const
|
||||||
|
{ return host(row,col); }
|
||||||
|
|
||||||
|
/// Returns pointer to memory pointer for allocation on host
|
||||||
|
inline hosttype ** host_ptr() { return host.host_ptr(); }
|
||||||
|
|
||||||
|
/// Return the default command queue/stream associated with this data
|
||||||
|
inline command_queue & cq() { return host.cq(); }
|
||||||
|
/// Block until command_queue associated with matrix is complete
|
||||||
|
inline void sync() { host.sync(); }
|
||||||
|
|
||||||
|
///Get the size of a row on the host (including any padding) in elements
|
||||||
|
inline size_t row_size() const { return host.row_size(); }
|
||||||
|
/// Get the size of a row on the host(including any padding) in bytes
|
||||||
|
inline size_t row_bytes() const { return host.row_bytes(); }
|
||||||
|
/// Get the size on the host in bytes of 1 element
|
||||||
|
inline int element_size() const { return sizeof(hosttype); }
|
||||||
|
|
||||||
|
|
||||||
|
/// Update the allocation on the host asynchronously
|
||||||
|
inline void update_host()
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,true); }
|
||||||
|
/// Update the allocation on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,async); }
|
||||||
|
/// Update the allocation on the host (using command queue)
|
||||||
|
inline void update_host(command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,cq); }
|
||||||
|
/// Update the first n elements on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const int n, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,n,_buffer,async); }
|
||||||
|
/// Update the first n elements on the host (using command queue)
|
||||||
|
inline void update_host(const int n, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,n,_buffer,cq); }
|
||||||
|
/// Update slice on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const int rows, const int cols, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,rows,cols,_buffer,async); }
|
||||||
|
/// Update slice on the host (using command queue)
|
||||||
|
inline void update_host(const int rows, const int cols, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,rows,cols,_buffer,cq); }
|
||||||
|
|
||||||
|
|
||||||
|
/// Update the allocation on the device asynchronously
|
||||||
|
inline void update_device()
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,true); }
|
||||||
|
/// Update the allocation on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,async); }
|
||||||
|
/// Update the allocation on the device (using command queue)
|
||||||
|
inline void update_device(command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,cq); }
|
||||||
|
/// Update the first n elements on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const int n, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,n,_buffer,async); }
|
||||||
|
/// Update the first n elements on the device (using command queue)
|
||||||
|
inline void update_device(const int n, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,n,_buffer,cq); }
|
||||||
|
/// Update slice on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const int rows, const int cols, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,rows,cols,_buffer,async); }
|
||||||
|
/// Update slice on the device (using command queue)
|
||||||
|
inline void update_device(const int rows, const int cols, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,rows,cols,_buffer,cq); }
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
UCL_H_Mat<devtype> _buffer;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -0,0 +1,272 @@
|
||||||
|
/***************************************************************************
|
||||||
|
ucl_s_obj_help.h
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown
|
||||||
|
|
||||||
|
Helper routines for allocating memory for s-objects and performing
|
||||||
|
host/device updates. (Different routines depending on whether the
|
||||||
|
same type is used on the host and device).
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : Mon May 14 2012
|
||||||
|
copyright : (C) 2012 by W. Michael Brown
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
This software is distributed under the Simplified BSD License.
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
template <int st> struct _ucl_s_obj_help;
|
||||||
|
|
||||||
|
// Host and device containers are same type
|
||||||
|
// -- Don't need casting buffers
|
||||||
|
// -- Can potentially use same memory if shared by accelerator
|
||||||
|
template <> struct _ucl_s_obj_help<1> {
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(cols,acc,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
if (acc.shared_memory()) {
|
||||||
|
device.view(host);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.alloc(cols,acc,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3, class mat_type>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(cols,cq,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
return device.alloc(cols,cq,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int rows, const int cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(rows,cols,acc,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
if (acc.shared_memory()) {
|
||||||
|
device.view(host);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.alloc(rows,cols,acc,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3, class mat_type>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int rows, const int cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(rows,cols,cq,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
return device.alloc(rows,cols,cq,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
|
||||||
|
ucl_copy(dst,src,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
|
||||||
|
ucl_copy(dst,src,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||||
|
const bool async) {
|
||||||
|
ucl_copy(dst,src,cols,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||||
|
command_queue &cq) {
|
||||||
|
ucl_copy(dst,src,cols,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||||
|
t3 &buffer, const bool async) {
|
||||||
|
ucl_copy(dst,src,rows,cols,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||||
|
t3 &buffer, command_queue &cq) {
|
||||||
|
ucl_copy(dst,src,rows,cols,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||||
|
if (device.kind()==UCL_VIEW) {
|
||||||
|
device.view(host);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.resize(cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||||
|
const int cols) {
|
||||||
|
if (device.kind()==UCL_VIEW) {
|
||||||
|
device.view(host);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.resize(rows,cols);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Host and device containers are different types
|
||||||
|
template <int st> struct _ucl_s_obj_help {
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(cols,acc,UCL_NOT_PINNED);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
e1=_buffer.alloc(cols,acc,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
|
||||||
|
if (acc.shared_memory()) {
|
||||||
|
device.view(_buffer);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.alloc(cols,acc,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3, class mat_type>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(cols,cq,UCL_NOT_PINNED);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
e1=_buffer.alloc(cols,cq,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
return device.alloc(cols,cq,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int rows, const int cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
e1=_buffer.alloc(rows,cols,acc,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
|
||||||
|
if (acc.shared_memory()) {
|
||||||
|
device.view(_buffer);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.alloc(rows,cols,acc,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3, class mat_type>
|
||||||
|
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||||
|
const int rows, const int cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1,
|
||||||
|
const enum UCL_MEMOPT kind2) {
|
||||||
|
int e1;
|
||||||
|
e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
e1=_buffer.alloc(rows,cols,cq,kind1);
|
||||||
|
if (e1!=UCL_SUCCESS)
|
||||||
|
return e1;
|
||||||
|
return device.alloc(rows,cols,cq,kind2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
|
||||||
|
ucl_cast_copy(dst,src,buffer,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
|
||||||
|
ucl_cast_copy(dst,src,buffer,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||||
|
const bool async) {
|
||||||
|
ucl_cast_copy(dst,src,cols,buffer,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||||
|
command_queue &cq) {
|
||||||
|
ucl_cast_copy(dst,src,cols,buffer,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||||
|
t3 &buffer, const bool async) {
|
||||||
|
ucl_cast_copy(dst,src,rows,cols,buffer,async);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||||
|
t3 &buffer, command_queue &cq) {
|
||||||
|
ucl_cast_copy(dst,src,rows,cols,buffer,cq);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||||
|
int err=buff.resize(cols);
|
||||||
|
if (err!=UCL_SUCCESS)
|
||||||
|
return err;
|
||||||
|
|
||||||
|
if (device.kind()==UCL_VIEW) {
|
||||||
|
device.view(buff);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.resize(cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class t1, class t2, class t3>
|
||||||
|
static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
|
||||||
|
const int cols) {
|
||||||
|
int err=buff.resize(rows,cols);
|
||||||
|
if (err!=UCL_SUCCESS)
|
||||||
|
return err;
|
||||||
|
|
||||||
|
if (device.kind()==UCL_VIEW) {
|
||||||
|
device.view(buff);
|
||||||
|
return UCL_SUCCESS;
|
||||||
|
} else
|
||||||
|
return device.resize(rows,cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
|
@ -0,0 +1,223 @@
|
||||||
|
/***************************************************************************
|
||||||
|
ucl_vector.h
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown
|
||||||
|
|
||||||
|
Vector Container on Host
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the Geryon Unified Coprocessor Library (UCL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : Thu May 10 2012
|
||||||
|
copyright : (C) 2012 by W. Michael Brown
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
This software is distributed under the Simplified BSD License.
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
|
||||||
|
// Only allow this file to be included by CUDA and OpenCL specific headers
|
||||||
|
#ifdef _UCL_MAT_ALLOW
|
||||||
|
|
||||||
|
/// Row Vector S-Object
|
||||||
|
template <class hosttype, class devtype>
|
||||||
|
class UCL_Vector {
|
||||||
|
public:
|
||||||
|
// Traits for copying data
|
||||||
|
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
|
||||||
|
enum traits {
|
||||||
|
DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
|
||||||
|
MEM_TYPE = 1,
|
||||||
|
PADDED = 0,
|
||||||
|
ROW_MAJOR = 1,
|
||||||
|
VECTOR = 1
|
||||||
|
};
|
||||||
|
typedef hosttype data_type;
|
||||||
|
|
||||||
|
/// Host Allocation
|
||||||
|
UCL_H_Vec<hosttype> host;
|
||||||
|
|
||||||
|
/// Device Allocation
|
||||||
|
UCL_D_Vec<devtype> device;
|
||||||
|
|
||||||
|
UCL_Vector() { }
|
||||||
|
~UCL_Vector() { }
|
||||||
|
|
||||||
|
/// Construct with n columns
|
||||||
|
/** \sa alloc() **/
|
||||||
|
UCL_Vector(const size_t cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Set up the vector with 'cols' columns and reserve memory
|
||||||
|
/** The kind1 parameter controls memory pinning as follows:
|
||||||
|
* - UCL_NOT_PINNED - Memory is not pinned
|
||||||
|
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
|
||||||
|
* - UCL_RW_OPTIMIZED - Memory can be pinned
|
||||||
|
* The kind2 parameter controls memory optimizations as follows:
|
||||||
|
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||||
|
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
|
||||||
|
* - UCL_READ_ONLY - Specify that you will only read in kernels
|
||||||
|
* \note When passing a command queue instead of a device, the device
|
||||||
|
* allocation is always performed. Even if the device shares memory
|
||||||
|
* with the host.
|
||||||
|
* \param cq Default command queue for operations copied from another mat
|
||||||
|
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||||
|
template <class mat_type>
|
||||||
|
inline int alloc(const size_t cols, mat_type &cq,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,cols,cq,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Set up host vector with 'cols' columns and reserve memory
|
||||||
|
/** The kind1 parameter controls memory pinning as follows:
|
||||||
|
* - UCL_NOT_PINNED - Memory is not pinned
|
||||||
|
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
|
||||||
|
* - UCL_RW_OPTIMIZED - Memory can be pinned
|
||||||
|
* The kind2 parameter controls memory optimizations as follows:
|
||||||
|
* - UCL_READ_WRITE - Specify that you will read and write in kernels
|
||||||
|
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
|
||||||
|
* - UCL_READ_ONLY - Specify that you will only read in kernels
|
||||||
|
* \param device Used to get the default command queue for operations
|
||||||
|
* \return UCL_SUCCESS if the memory allocation is successful **/
|
||||||
|
inline int alloc(const size_t cols, UCL_Device &acc,
|
||||||
|
const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
|
||||||
|
const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
|
||||||
|
{ return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
alloc(host,device,_buffer,cols,acc,kind1,kind2); }
|
||||||
|
|
||||||
|
/// Free memory and set size to 0
|
||||||
|
inline void clear()
|
||||||
|
{ host.clear(); device.clear(); }
|
||||||
|
|
||||||
|
/// Resize the allocation to contain cols elements
|
||||||
|
inline int resize(const int cols) {
|
||||||
|
assert(host.kind()!=UCL_VIEW);
|
||||||
|
int err=host.resize(cols);
|
||||||
|
if (err!=UCL_SUCCESS)
|
||||||
|
return err;
|
||||||
|
return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
dev_resize(device,host,_buffer,cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resize (only if bigger) the allocation to contain cols elements
|
||||||
|
inline int resize_ib(const int new_cols)
|
||||||
|
{ if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
|
||||||
|
|
||||||
|
/// Set each element to zero
|
||||||
|
inline void zero() { host.zero(); device.zero(); }
|
||||||
|
|
||||||
|
/// Set first n elements to zero
|
||||||
|
inline void zero(const int n) { host.zero(n); device.zero(n); }
|
||||||
|
|
||||||
|
/// Get the number of elements
|
||||||
|
inline size_t numel() const { return host.numel(); }
|
||||||
|
/// Get the number of rows
|
||||||
|
inline size_t rows() const { return host.rows(); }
|
||||||
|
/// Get the number of columns
|
||||||
|
inline size_t cols() const { return host.cols(); }
|
||||||
|
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||||
|
inline size_t host_mem_usage()
|
||||||
|
{ return host.row_bytes()+_buffer.row_bytes(); }
|
||||||
|
/// Get the memory usage (bytes) of the s-object (including any buffers)
|
||||||
|
inline size_t device_mem_usage()
|
||||||
|
{ return device.row_bytes(); }
|
||||||
|
|
||||||
|
|
||||||
|
/// Get element at index i
|
||||||
|
inline hosttype & operator[](const int i) { return host[i]; }
|
||||||
|
/// Get element at index i
|
||||||
|
inline const hosttype & operator[](const int i) const { return host[i]; }
|
||||||
|
/// 2D access (row should always be 0)
|
||||||
|
inline hosttype & operator()(const int row, const int col)
|
||||||
|
{ return host[col]; }
|
||||||
|
/// 2D access (row should always be 0)
|
||||||
|
inline const hosttype & operator()(const int row, const int col) const
|
||||||
|
{ return host[col]; }
|
||||||
|
|
||||||
|
/// Returns pointer to memory pointer for allocation on host
|
||||||
|
inline hosttype ** host_ptr() { return host.host_ptr(); }
|
||||||
|
|
||||||
|
/// Return the default command queue/stream associated with this data
|
||||||
|
inline command_queue & cq() { return host.cq(); }
|
||||||
|
/// Block until command_queue associated with matrix is complete
|
||||||
|
inline void sync() { host.sync(); }
|
||||||
|
|
||||||
|
///Get the size of a row on the host (including any padding) in elements
|
||||||
|
inline size_t row_size() const { return host.row_size(); }
|
||||||
|
/// Get the size of a row on the host(including any padding) in bytes
|
||||||
|
inline size_t row_bytes() const { return host.row_bytes(); }
|
||||||
|
/// Get the size on the host in bytes of 1 element
|
||||||
|
inline int element_size() const { return sizeof(hosttype); }
|
||||||
|
|
||||||
|
|
||||||
|
/// Update the allocation on the host asynchronously
|
||||||
|
inline void update_host()
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,true); }
|
||||||
|
/// Update the allocation on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,async); }
|
||||||
|
/// Update the allocation on the host (using command queue)
|
||||||
|
inline void update_host(command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,_buffer,cq); }
|
||||||
|
/// Update the first n elements on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const int n, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,n,_buffer,async); }
|
||||||
|
/// Update the first n elements on the host (using command queue)
|
||||||
|
inline void update_host(const int n, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,n,_buffer,cq); }
|
||||||
|
/// Update slice on the host (true for asynchronous copy)
|
||||||
|
inline void update_host(const int rows, const int cols, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,rows,cols,_buffer,async); }
|
||||||
|
/// Update slice on the host (using command queue)
|
||||||
|
inline void update_host(const int rows, const int cols, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(host,device,rows,cols,_buffer,cq); }
|
||||||
|
|
||||||
|
|
||||||
|
/// Update the allocation on the device asynchronously
|
||||||
|
inline void update_device()
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,true); }
|
||||||
|
/// Update the allocation on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,async); }
|
||||||
|
/// Update the allocation on the device (using command queue)
|
||||||
|
inline void update_device(command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,_buffer,cq); }
|
||||||
|
/// Update the first n elements on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const int n, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,n,_buffer,async); }
|
||||||
|
/// Update the first n elements on the device (using command queue)
|
||||||
|
inline void update_device(const int n, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,n,_buffer,cq); }
|
||||||
|
/// Update slice on the device (true for asynchronous copy)
|
||||||
|
inline void update_device(const int rows, const int cols, const bool async)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,rows,cols,_buffer,async); }
|
||||||
|
/// Update slice on the device (using command queue)
|
||||||
|
inline void update_device(const int rows, const int cols, command_queue &cq)
|
||||||
|
{ _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
|
||||||
|
copy(device,host,rows,cols,_buffer,cq); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
UCL_H_Vec<devtype> _buffer;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
|
@ -0,0 +1,315 @@
|
||||||
|
/***************************************************************************
|
||||||
|
base_dipole.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Base class for pair styles needing per-particle data for position,
|
||||||
|
dipole, and type.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include "lal_base_dipole.h"
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define BaseDipoleT BaseDipole<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
|
||||||
|
device=&global_device;
|
||||||
|
ans=new Answer<numtyp,acctyp>();
|
||||||
|
nbor=new Neighbor();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BaseDipoleT::~BaseDipole() {
|
||||||
|
delete ans;
|
||||||
|
delete nbor;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||||
|
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
||||||
|
nbor->bytes_per_atom(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
||||||
|
const int max_nbors, const int maxspecial,
|
||||||
|
const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
const void *pair_program,
|
||||||
|
const char *k_name) {
|
||||||
|
screen=_screen;
|
||||||
|
|
||||||
|
int gpu_nbor=0;
|
||||||
|
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||||
|
gpu_nbor=1;
|
||||||
|
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||||
|
gpu_nbor=2;
|
||||||
|
|
||||||
|
int _gpu_host=0;
|
||||||
|
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||||
|
if (host_nlocal>0)
|
||||||
|
_gpu_host=1;
|
||||||
|
|
||||||
|
_threads_per_atom=device->threads_per_charge();
|
||||||
|
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||||
|
nbor->packing(true);
|
||||||
|
_nbor_data=&(nbor->dev_packed);
|
||||||
|
} else
|
||||||
|
_nbor_data=&(nbor->dev_nbor);
|
||||||
|
|
||||||
|
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
|
||||||
|
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||||
|
_threads_per_atom);
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
ucl_device=device->gpu;
|
||||||
|
atom=&device->atom;
|
||||||
|
|
||||||
|
_block_size=device->pair_block_size();
|
||||||
|
_block_bio_size=device->block_bio_pair();
|
||||||
|
compile_kernels(*ucl_device,pair_program,k_name);
|
||||||
|
|
||||||
|
// Initialize host-device load balancer
|
||||||
|
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||||
|
|
||||||
|
// Initialize timers for the selected GPU
|
||||||
|
time_pair.init(*ucl_device);
|
||||||
|
time_pair.zero();
|
||||||
|
|
||||||
|
pos_tex.bind_float(atom->x,4);
|
||||||
|
q_tex.bind_float(atom->q,1);
|
||||||
|
mu_tex.bind_float(atom->quat,4);
|
||||||
|
|
||||||
|
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BaseDipoleT::estimate_gpu_overhead() {
|
||||||
|
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BaseDipoleT::clear_atomic() {
|
||||||
|
// Output any timing information
|
||||||
|
acc_timers();
|
||||||
|
double avg_split=hd_balancer.all_avg_split();
|
||||||
|
_gpu_overhead*=hd_balancer.timestep();
|
||||||
|
_driver_overhead*=hd_balancer.timestep();
|
||||||
|
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||||
|
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
|
||||||
|
|
||||||
|
if (_compiled) {
|
||||||
|
k_pair_fast.clear();
|
||||||
|
k_pair.clear();
|
||||||
|
delete pair_program;
|
||||||
|
_compiled=false;
|
||||||
|
}
|
||||||
|
|
||||||
|
time_pair.clear();
|
||||||
|
hd_balancer.clear();
|
||||||
|
|
||||||
|
nbor->clear();
|
||||||
|
ans->clear();
|
||||||
|
device->clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Copy neighbor list from host
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||||
|
int *numj, int **firstneigh, bool &success) {
|
||||||
|
success=true;
|
||||||
|
|
||||||
|
int mn=nbor->max_nbor_loop(inum,numj,ilist);
|
||||||
|
resize_atom(inum,nall,success);
|
||||||
|
resize_local(inum,mn,success);
|
||||||
|
if (!success)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||||
|
|
||||||
|
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||||
|
if (bytes>_max_an_bytes)
|
||||||
|
_max_an_bytes=bytes;
|
||||||
|
|
||||||
|
return ilist;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Build neighbor list on device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
|
||||||
|
const int nall, double **host_x,
|
||||||
|
int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag,
|
||||||
|
int **nspecial, int **special,
|
||||||
|
bool &success) {
|
||||||
|
success=true;
|
||||||
|
resize_atom(inum,nall,success);
|
||||||
|
resize_local(inum,host_inum,nbor->max_nbors(),success);
|
||||||
|
if (!success)
|
||||||
|
return;
|
||||||
|
atom->cast_copy_x(host_x,host_type);
|
||||||
|
|
||||||
|
int mn;
|
||||||
|
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
|
||||||
|
nspecial, special, success, mn);
|
||||||
|
|
||||||
|
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||||
|
if (bytes>_max_an_bytes)
|
||||||
|
_max_an_bytes=bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BaseDipoleT::compute(const int f_ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
int *ilist, int *numj, int **firstneigh,
|
||||||
|
const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom,
|
||||||
|
int &host_start, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double **host_mu,
|
||||||
|
const int nlocal, double *boxlo, double *prd) {
|
||||||
|
acc_timers();
|
||||||
|
if (inum_full==0) {
|
||||||
|
host_start=0;
|
||||||
|
// Make sure textures are correct if realloc by a different hybrid style
|
||||||
|
resize_atom(0,nall,success);
|
||||||
|
zero_timers();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ago=hd_balancer.ago_first(f_ago);
|
||||||
|
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||||
|
ans->inum(inum);
|
||||||
|
host_start=inum;
|
||||||
|
|
||||||
|
if (ago==0) {
|
||||||
|
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
|
||||||
|
if (!success)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
atom->cast_x_data(host_x,host_type);
|
||||||
|
atom->cast_q_data(host_q);
|
||||||
|
atom->cast_quat_data(host_mu[0]);
|
||||||
|
hd_balancer.start_timer();
|
||||||
|
atom->add_x_data(host_x,host_type);
|
||||||
|
atom->add_q_data();
|
||||||
|
atom->add_quat_data();
|
||||||
|
|
||||||
|
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
|
||||||
|
boxlo, prd);
|
||||||
|
|
||||||
|
loop(eflag,vflag);
|
||||||
|
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||||
|
device->add_ans_object(ans);
|
||||||
|
hd_balancer.stop_timer();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Reneighbor on GPU if necessary and then compute forces, virials, energies
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int** BaseDipoleT::compute(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag,
|
||||||
|
int **nspecial, int **special, const bool eflag,
|
||||||
|
const bool vflag, const bool eatom,
|
||||||
|
const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum,
|
||||||
|
const double cpu_time, bool &success,
|
||||||
|
double *host_q, double **host_mu,
|
||||||
|
double *boxlo, double *prd) {
|
||||||
|
acc_timers();
|
||||||
|
if (inum_full==0) {
|
||||||
|
host_start=0;
|
||||||
|
// Make sure textures are correct if realloc by a different hybrid style
|
||||||
|
resize_atom(0,nall,success);
|
||||||
|
zero_timers();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
hd_balancer.balance(cpu_time);
|
||||||
|
int inum=hd_balancer.get_gpu_count(ago,inum_full);
|
||||||
|
ans->inum(inum);
|
||||||
|
host_start=inum;
|
||||||
|
|
||||||
|
// Build neighbor list on GPU if necessary
|
||||||
|
if (ago==0) {
|
||||||
|
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||||
|
sublo, subhi, tag, nspecial, special, success);
|
||||||
|
if (!success)
|
||||||
|
return NULL;
|
||||||
|
atom->cast_q_data(host_q);
|
||||||
|
atom->cast_quat_data(host_mu[0]);
|
||||||
|
hd_balancer.start_timer();
|
||||||
|
} else {
|
||||||
|
atom->cast_x_data(host_x,host_type);
|
||||||
|
atom->cast_q_data(host_q);
|
||||||
|
atom->cast_quat_data(host_mu[0]);
|
||||||
|
hd_balancer.start_timer();
|
||||||
|
atom->add_x_data(host_x,host_type);
|
||||||
|
}
|
||||||
|
atom->add_q_data();
|
||||||
|
atom->add_quat_data();
|
||||||
|
*ilist=nbor->host_ilist.begin();
|
||||||
|
*jnum=nbor->host_acc.begin();
|
||||||
|
|
||||||
|
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
|
||||||
|
boxlo, prd);
|
||||||
|
|
||||||
|
loop(eflag,vflag);
|
||||||
|
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
device->add_ans_object(ans);
|
||||||
|
hd_balancer.stop_timer();
|
||||||
|
|
||||||
|
return nbor->host_jlist.begin()-host_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double BaseDipoleT::host_memory_usage_atomic() const {
|
||||||
|
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||||
|
4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||||
|
const char *kname) {
|
||||||
|
if (_compiled)
|
||||||
|
return;
|
||||||
|
|
||||||
|
std::string s_fast=std::string(kname)+"_fast";
|
||||||
|
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
|
||||||
|
std::string(OCL_PRECISION_COMPILE)+" -D"+
|
||||||
|
std::string(OCL_VENDOR);
|
||||||
|
|
||||||
|
pair_program=new UCL_Program(dev);
|
||||||
|
pair_program->load_string(pair_str,flags.c_str());
|
||||||
|
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||||
|
k_pair.set_function(*pair_program,kname);
|
||||||
|
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||||
|
q_tex.get_texture(*pair_program,"q_tex");
|
||||||
|
mu_tex.get_texture(*pair_program,"mu_tex");
|
||||||
|
|
||||||
|
_compiled=true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template class BaseDipole<PRECISION,ACC_PRECISION>;
|
||||||
|
|
|
@ -0,0 +1,200 @@
|
||||||
|
/***************************************************************************
|
||||||
|
base_dipole.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Base class for pair styles needing per-particle data for position,
|
||||||
|
dipole, and type.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_BASE_DIPOLE_H
|
||||||
|
#define LAL_BASE_DIPOLE_H
|
||||||
|
|
||||||
|
#include "lal_device.h"
|
||||||
|
#include "lal_balance.h"
|
||||||
|
#include "mpi.h"
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "geryon/ocl_texture.h"
|
||||||
|
#else
|
||||||
|
#include "geryon/nvd_texture.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class BaseDipole {
|
||||||
|
public:
|
||||||
|
BaseDipole();
|
||||||
|
virtual ~BaseDipole();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
* \param k_name name for the kernel for force calculation
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen,
|
||||||
|
const void *pair_program, const char *k_name);
|
||||||
|
|
||||||
|
/// Estimate the overhead for GPU context changes and CPU driver
|
||||||
|
void estimate_gpu_overhead();
|
||||||
|
|
||||||
|
/// Check if there is enough storage for atom arrays and realloc if not
|
||||||
|
/** \param success set to false if insufficient memory **/
|
||||||
|
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||||
|
if (atom->resize(nall, success)) {
|
||||||
|
pos_tex.bind_float(atom->x,4);
|
||||||
|
q_tex.bind_float(atom->q,1);
|
||||||
|
mu_tex.bind_float(atom->quat,4);
|
||||||
|
}
|
||||||
|
ans->resize(inum,success);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if there is enough storage for neighbors and realloc if not
|
||||||
|
/** \param nlocal number of particles whose nbors must be stored on device
|
||||||
|
* \param host_inum number of particles whose nbors need to copied to host
|
||||||
|
* \param current maximum number of neighbors
|
||||||
|
* \note olist_size=total number of local particles **/
|
||||||
|
inline void resize_local(const int inum, const int max_nbors, bool &success) {
|
||||||
|
nbor->resize(inum,max_nbors,success);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if there is enough storage for neighbors and realloc if not
|
||||||
|
/** \param nlocal number of particles whose nbors must be stored on device
|
||||||
|
* \param host_inum number of particles whose nbors need to copied to host
|
||||||
|
* \param current maximum number of neighbors
|
||||||
|
* \note host_inum is 0 if the host is performing neighboring
|
||||||
|
* \note nlocal+host_inum=total number local particles
|
||||||
|
* \note olist_size=0 **/
|
||||||
|
inline void resize_local(const int inum, const int host_inum,
|
||||||
|
const int max_nbors, bool &success) {
|
||||||
|
nbor->resize(inum,host_inum,max_nbors,success);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear_atomic();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom_atomic(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage_atomic() const;
|
||||||
|
|
||||||
|
/// Accumulate timers
|
||||||
|
inline void acc_timers() {
|
||||||
|
if (device->time_device()) {
|
||||||
|
nbor->acc_timers();
|
||||||
|
time_pair.add_to_total();
|
||||||
|
atom->acc_timers();
|
||||||
|
ans->acc_timers();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zero timers
|
||||||
|
inline void zero_timers() {
|
||||||
|
time_pair.zero();
|
||||||
|
atom->zero_timers();
|
||||||
|
ans->zero_timers();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy neighbor list from host
|
||||||
|
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
|
||||||
|
int **firstneigh, bool &success);
|
||||||
|
|
||||||
|
/// Build neighbor list on device
|
||||||
|
void build_nbor_list(const int inum, const int host_inum,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, bool &success);
|
||||||
|
|
||||||
|
/// Pair loop with host neighboring
|
||||||
|
void compute(const int f_ago, const int inum_full, const int nall,
|
||||||
|
double **host_x, int *host_type, int *ilist, int *numj,
|
||||||
|
int **firstneigh, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
const double cpu_time, bool &success, double *charge,
|
||||||
|
double **mu, const int nlocal, double *boxlo, double *prd);
|
||||||
|
|
||||||
|
/// Pair loop with device neighboring
|
||||||
|
int** compute(const int ago, const int inum_full, const int nall,
|
||||||
|
double **host_x, int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **numj, const double cpu_time, bool &success,
|
||||||
|
double *charge, double **mu, double *boxlo, double *prd);
|
||||||
|
|
||||||
|
// -------------------------- DEVICE DATA -------------------------
|
||||||
|
|
||||||
|
/// Device Properties and Atom and Neighbor storage
|
||||||
|
Device<numtyp,acctyp> *device;
|
||||||
|
|
||||||
|
/// Geryon device
|
||||||
|
UCL_Device *ucl_device;
|
||||||
|
|
||||||
|
/// Device Timers
|
||||||
|
UCL_Timer time_pair;
|
||||||
|
|
||||||
|
/// Host device load balancer
|
||||||
|
Balance<numtyp,acctyp> hd_balancer;
|
||||||
|
|
||||||
|
/// LAMMPS pointer for screen output
|
||||||
|
FILE *screen;
|
||||||
|
|
||||||
|
// --------------------------- ATOM DATA --------------------------
|
||||||
|
|
||||||
|
/// Atom Data
|
||||||
|
Atom<numtyp,acctyp> *atom;
|
||||||
|
|
||||||
|
|
||||||
|
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||||
|
|
||||||
|
Answer<numtyp,acctyp> *ans;
|
||||||
|
|
||||||
|
// --------------------------- NBOR DATA ----------------------------
|
||||||
|
|
||||||
|
/// Neighbor data
|
||||||
|
Neighbor *nbor;
|
||||||
|
|
||||||
|
// ------------------------- DEVICE KERNELS -------------------------
|
||||||
|
UCL_Program *pair_program;
|
||||||
|
UCL_Kernel k_pair_fast, k_pair;
|
||||||
|
inline int block_size() { return _block_size; }
|
||||||
|
|
||||||
|
// --------------------------- TEXTURES -----------------------------
|
||||||
|
UCL_Texture pos_tex;
|
||||||
|
UCL_Texture q_tex;
|
||||||
|
UCL_Texture mu_tex;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool _compiled;
|
||||||
|
int _block_size, _block_bio_size, _threads_per_atom;
|
||||||
|
double _max_bytes, _max_an_bytes;
|
||||||
|
double _gpu_overhead, _driver_overhead;
|
||||||
|
UCL_D_Vec<int> *_nbor_data;
|
||||||
|
|
||||||
|
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||||
|
|
||||||
|
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,162 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "born_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *born=0;
|
||||||
|
#else
|
||||||
|
#include "born_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_born.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define BornT Born<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornT::~Born() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornT::init(const int ntypes, double **host_cutsq,
|
||||||
|
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||||
|
double **host_born3, double **host_a, double **host_c,
|
||||||
|
double **host_d, double **host_sigma,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,born,"k_born");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
|
||||||
|
host_born1,host_born2,host_born3);
|
||||||
|
|
||||||
|
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||||
|
host_d,host_offset);
|
||||||
|
|
||||||
|
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||||
|
host_sigma);
|
||||||
|
|
||||||
|
UCL_H_Vec<double> dview;
|
||||||
|
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||||
|
ucl_copy(sp_lj,dview,false);
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
|
||||||
|
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
coeff1.clear();
|
||||||
|
coeff2.clear();
|
||||||
|
cutsq_sigma.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double BornT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(Born<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2,
|
||||||
|
&cutsq_sigma, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
|
||||||
|
&cutsq_sigma, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(), &this->ans->force,
|
||||||
|
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||||
|
&nbor_pitch, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class Born<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,201 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// born.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the born pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// General Born kernel: coefficient tables are read from global memory and
// indexed by itype*lj_types+jtype.
//   coeff1[m]      : .x multiplies (sigma - r) inside the exponential,
//                    .y/.z/.w weight the exponential, r^-6 and r^-8 force terms
//   coeff2[m]      : .x/.y/.z weight the matching energy terms, .w is subtracted
//   cutsq_sigma[m] : .x squared cutoff, .y sigma
__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
                     __global numtyp4* coeff2,
                     __global numtyp2 *cutsq_sigma,
                     const int lj_types, __global numtyp *sp_lj_in,
                     __global int *dev_nbor, __global int *dev_packed,
                     __global acctyp4 *ans, __global acctyp *engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Every thread copies the four special-bond factors into local memory
  __local numtyp sp_lj[4];
  for (int k=0; k<4; k++)
    sp_lj[k]=sp_lj_in[k];

  // Accumulators for this thread
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The special-bond selector is taken from the raw entry before
      // it is masked down to the atom index
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<cutsq_sigma[mtype].x) {
        numtyp r=ucl_sqrt(rsq);
        numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
        numtyp r2inv=ucl_recip(rsq);
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r2inv*(coeff1[mtype].y*r*rexp
          - coeff1[mtype].z*r6inv + coeff1[mtype].w*r2inv*r6inv);
        force*=factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
            + coeff2[mtype].z*r2inv*r6inv;
          energy+=factor_lj*(e-coeff2[mtype].w);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Born pair-style kernel, shared-table variant.
//
// Same arithmetic as k_born, but the coeff1/coeff2 tables (padded to
// MAX_SHARED_TYPES x MAX_SHARED_TYPES) and the special-bond factors are first
// copied into __local arrays by the leading threads of the block. coeff2 is
// only staged when energies are requested (eflag>0). cutsq_sigma is still
// read through its __global pointer.
// ---------------------------------------------------------------------------
__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                          __global numtyp4* coeff2_in,
                          __global numtyp2 *cutsq_sigma,
                          __global numtyp* sp_lj_in,
                          __global int *dev_nbor, __global int *dev_packed,
                          __global acctyp4 *ans, __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];

  // Cooperative load: thread tid copies entry tid of each table
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    coeff1[tid]=coeff1_in[tid];
    if (eflag>0)
      coeff2[tid]=coeff2_in[tid];
  }

  // Per-thread accumulators
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int k=0; k<6; k++)
    virial[k]=(acctyp)0;

  // The staged tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    // Row offset of atom i's type in the MAX_SHARED_TYPES-wide tables
    const int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      // The neighbor entry carries the special-bond selector in its upper
      // bits (decoded by sbmask) and the atom index under NEIGHMASK
      int j=*nbor;
      const numtyp factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Separation vector and squared distance
      const numtyp dx = ix.x-jx.x;
      const numtyp dy = ix.y-jx.y;
      const numtyp dz = ix.z-jx.z;
      const numtyp rsq = dx*dx+dy*dy+dz*dz;

      // Nothing to do for pairs that are not inside the cutoff
      if (!(rsq<cutsq_sigma[mtype].x))
        continue;

      const numtyp r = ucl_sqrt(rsq);
      const numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
      const numtyp r2inv = ucl_recip(rsq);
      const numtyp r6inv = r2inv*r2inv*r2inv;
      numtyp fpair = r2inv*(coeff1[mtype].y*r*rexp
                            - coeff1[mtype].z*r6inv
                            + coeff1[mtype].w*r2inv*r6inv);
      fpair*=factor_lj;

      f.x+=dx*fpair;
      f.y+=dy*fpair;
      f.z+=dz*fpair;

      if (eflag>0) {
        const numtyp e_pair=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
                            + coeff2[mtype].z*r2inv*r6inv;
        energy+=factor_lj*(e_pair-coeff2[mtype].w);
      }
      if (vflag>0) {
        virial[0] += dx*dx*fpair;
        virial[1] += dy*dy*fpair;
        virial[2] += dz*dz*fpair;
        virial[3] += dx*dy*fpair;
        virial[4] += dx*dz*fpair;
        virial[5] += dy*dz*fpair;
      }
    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_BORN_H
|
||||||
|
#define LAL_BORN_H
|
||||||
|
|
||||||
|
#include "lal_base_atomic.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class Born : public BaseAtomic<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
Born();
|
||||||
|
~Born();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq,
|
||||||
|
double **host_rhoinv, double **host_born1, double **host_born2,
|
||||||
|
double **host_born3, double **host_a, double **host_c,
|
||||||
|
double **host_d, double **host_sigma,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||||
|
/// coeff1.w = born3
|
||||||
|
UCL_D_Vec<numtyp4> coeff1;
|
||||||
|
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
|
||||||
|
UCL_D_Vec<numtyp4> coeff2;
|
||||||
|
/// cutsq_sigma
|
||||||
|
UCL_D_Vec<numtyp2> cutsq_sigma;
|
||||||
|
/// Special LJ values
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,175 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_long.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born/coul/long pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "born_coul_long_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *born_coul_long=0;
|
||||||
|
#else
|
||||||
|
#include "born_coul_long_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_born_coul_long.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define BornCoulLongT BornCoulLong<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornCoulLongT::~BornCoulLongT() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||||
|
double **host_born1, double **host_born2, double **host_born3,
|
||||||
|
double **host_a, double **host_c, double **host_d,
|
||||||
|
double **host_sigma, double **host_offset,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, const double host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e,
|
||||||
|
const double g_ewald) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,born_coul_long,"k_born_long");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
|
||||||
|
host_born1,host_born2,host_born3);
|
||||||
|
|
||||||
|
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||||
|
host_d,host_offset);
|
||||||
|
|
||||||
|
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||||
|
host_cut_ljsq,host_sigma);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_cut_coulsq=host_cut_coulsq;
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
_g_ewald=g_ewald;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
|
||||||
|
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornCoulLongT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
coeff1.clear();
|
||||||
|
coeff2.clear();
|
||||||
|
cutsq_sigma.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double BornCoulLongT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(BornCoulLong<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(),
|
||||||
|
&this->ans->force,
|
||||||
|
&this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||||
|
&_g_ewald, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv,
|
||||||
|
&eflag, &vflag, &ainum,
|
||||||
|
&nbor_pitch, &this->atom->q,
|
||||||
|
&cutsq_sigma, &_cut_coulsq,
|
||||||
|
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class BornCoulLong<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,262 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// born_coul_long.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the born/coul/long pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// born/coul/long kernel; per-type coefficients are read from global memory.
//
// For every neighbor j of atom i with rsq < cutsq_sigma.x the kernel adds
//   - a real-space Ewald Coulomb term when rsq < cut_coulsq, using the
//     polynomial erfc approximation (EWALD_P, A1..A5, EWALD_F), and
//   - a Born term when rsq < cutsq_sigma.y:
//       coeff1.y*r*rexp - coeff1.z*r6inv + coeff1.w*r2inv*r6inv,
//       rexp = exp((cutsq_sigma.z - r)*coeff1.x)
// sp_lj[0-3] scale the Born term, sp_lj[4-7] the Coulomb term. Type tables
// are indexed by itype*lj_types+jtype, the type being the w component of
// each position. Results are handed to store_answers_q().
// ---------------------------------------------------------------------------
__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
                          __global numtyp4* coeff2, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_,
                          __global numtyp4 *cutsq_sigma,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Special-bond factors: [0-3] Born, [4-7] Coulomb
  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];
  sp_lj[4]=sp_lj_in[4];
  sp_lj[5]=sp_lj_in[5];
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp r2inv = ucl_recip(rsq);
        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
        numtyp rexp = (numtyp)0.0;

        // Real-space Ewald Coulomb contribution
        if (rsq < cut_coulsq) {
          numtyp r = ucl_rsqrt(r2inv);
          numtyp grij = g_ewald * r;
          numtyp expm2 = ucl_exp(-grij*grij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else forcecoul = (numtyp)0.0;

        // Born contribution, limited by its own cutoff (cutsq_sigma.y)
        if (rsq < cutsq_sigma[mtype].y) {
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
                       + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
        } else forceborn = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          if (rsq < cut_coulsq)
            e_coul += prefactor*(_erfc-factor_coul);
          // Gate the Born energy on the Born cutoff, the same test that
          // guards the force term above. coeff1.w is the born3 coefficient,
          // not a distance, and testing against it could read r6inv before
          // it was ever assigned.
          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
                     + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// born/coul/long kernel, shared-table variant.
//
// Same arithmetic as k_born_long, but coeff1/coeff2 (padded to
// MAX_SHARED_TYPES x MAX_SHARED_TYPES) and the eight special-bond factors
// are first copied into __local arrays by the leading threads of the block.
// coeff2 is only staged when energies are requested (eflag>0). cutsq_sigma
// is still read through its __global pointer:
//   .x overall cutoff, .y Born cutoff, .z sigma term of the exponential.
// ---------------------------------------------------------------------------
__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                               __global numtyp4* coeff2_in,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, __global numtyp *q_,
                               __global numtyp4 *cutsq_sigma,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp g_ewald, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];

  // Cooperative load: thread tid copies entry tid of each table
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    coeff1[tid]=coeff1_in[tid];
    if (eflag>0)
      coeff2[tid]=coeff2_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The staged tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq_sigma[mtype].x) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
        numtyp rexp = (numtyp)0.0;

        // Real-space Ewald Coulomb contribution
        if (rsq < cut_coulsq) {
          numtyp r=ucl_rsqrt(r2inv);
          numtyp grij = g_ewald * r;
          numtyp expm2 = ucl_exp(-grij*grij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else forcecoul = (numtyp)0.0;

        // Born contribution, limited by its own cutoff (cutsq_sigma.y)
        if (rsq < cutsq_sigma[mtype].y) {
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
                       + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
        } else forceborn = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          if (rsq < cut_coulsq)
            e_coul += prefactor*(_erfc-factor_coul);
          // Gate the Born energy on the Born cutoff, the same test that
          // guards the force term above. coeff1.w is the born3 coefficient,
          // not a distance, and testing against it could read r6inv before
          // it was ever assigned.
          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
                     + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_long.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born/coul/long pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_BORN_COUL_LONG_H
|
||||||
|
#define LAL_BORN_COUL_LONG_H
|
||||||
|
|
||||||
|
#include "lal_base_charge.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class BornCoulLong : public BaseCharge<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
BornCoulLong();
|
||||||
|
~BornCoulLong();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||||
|
double **host_born1, double **host_born2, double **host_born3,
|
||||||
|
double **host_a, double **host_c, double **host_d,
|
||||||
|
double **host_sigma, double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||||
|
const double host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e, const double g_ewald);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
|
||||||
|
/// coeff1.w = born3
|
||||||
|
UCL_D_Vec<numtyp4> coeff1;
|
||||||
|
/// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
|
||||||
|
UCL_D_Vec<numtyp4> coeff2;
|
||||||
|
/// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
|
||||||
|
/// cutsq_sigma.z = sigma
|
||||||
|
UCL_D_Vec<numtyp4> cutsq_sigma;
|
||||||
|
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,132 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_long_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to born/coul/long acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_born_coul_long.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Initialise the file-level born/coul/long object for a new run.
// World rank 0 initialises first (the progress message describes this as
// compiling), then the remaining ranks initialise in turn, one GPU-local
// rank per pass, with a barrier after each pass. gpu_mode is overwritten
// with the mode reported by the device. Note that a fixed value of 300 is
// forwarded to init() as the neighbor count; the max_nbors argument is not
// used. Returns the status of this process's init() call (0 on success).
int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                    double **host_born1, double **host_born2, double **host_born3,
                    double **host_a, double **host_c, double **host_d,
                    double **sigma, double **offset, double *special_lj,
                    const int inum, const int nall, const int max_nbors,
                    const int maxspecial, const double cell_size, int &gpu_mode,
                    FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
                    double *host_special_coul, const double qqrd2e,
                    const double g_ewald) {
  BORNCLMF.clear();
  gpu_mode=BORNCLMF.device->gpu_mode();
  const double gpu_split=BORNCLMF.device->particle_split();
  const int first_gpu=BORNCLMF.device->first_device();
  const int last_gpu=BORNCLMF.device->last_device();
  const int world_me=BORNCLMF.device->world_me();
  const int gpu_rank=BORNCLMF.device->gpu_rank();
  const int procs_per_gpu=BORNCLMF.device->procs_per_gpu();

  BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu);

  // Only replica 0 reports progress, and only when a screen is attached
  const bool message=(BORNCLMF.device->replica_me()==0 && screen);

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0) {
    init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                          host_born3, host_a, host_c, host_d, sigma, offset,
                          special_lj, inum, nall, 300, maxspecial, cell_size,
                          gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
                          host_special_coul, qqrd2e, g_ewald);
  }

  BORNCLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int core=0; core<procs_per_gpu; core++) {
    if (message) {
      if (last_gpu==first_gpu)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,core);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,core);
      fflush(screen);
    }
    // World rank 0 is already set up; every other rank takes its own pass
    if (gpu_rank==core && world_me!=0) {
      init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                            host_born3, host_a, host_c, host_d, sigma, offset,
                            special_lj, inum, nall, 300, maxspecial, cell_size,
                            gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
                            host_special_coul, qqrd2e, g_ewald);
    }

    BORNCLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    BORNCLMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Library entry point: resets the file-scope born/coul/long accelerator
// object by forwarding to its clear() method.
void borncl_gpu_clear()
{
  BORNCLMF.clear();
}
|
||||||
|
|
||||||
|
int** borncl_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double *boxlo,
|
||||||
|
double *prd) {
|
||||||
|
return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Library entry point for the born/coul/long compute call that is given an
// existing neighbor list (ilist / numj / firstneigh). Every argument is
// forwarded unchanged to BORNCLMF.compute().
void borncl_gpu_compute(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, int *ilist, int *numj,
                        int **firstneigh, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success, double *host_q,
                        const int nlocal, double *boxlo, double *prd) {
  BORNCLMF.compute(ago, inum_full, nall,
                   host_x, host_type,
                   ilist, numj, firstneigh,
                   eflag, vflag, eatom, vatom,
                   host_start, cpu_time, success,
                   host_q, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double borncl_gpu_bytes() {
|
||||||
|
return BORNCLMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,176 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_wolf.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born/coul/wolf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "born_coul_wolf_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *born_coul_wolf=0;
|
||||||
|
#else
|
||||||
|
#include "born_coul_wolf_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_born_coul_wolf.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define BornCoulWolfT BornCoulWolf<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornCoulWolfT::BornCoulWolf() : BaseCharge<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
BornCoulWolfT::~BornCoulWolfT() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornCoulWolfT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
|
||||||
|
double **host_born1, double **host_born2, double **host_born3,
|
||||||
|
double **host_a, double **host_c, double **host_d,
|
||||||
|
double **host_sigma, double **host_offset,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, const double host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e,
|
||||||
|
const double alf, const double e_shift, const double f_shift) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,born_coul_wolf,"k_born_wolf");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
|
||||||
|
host_born1,host_born2,host_born3);
|
||||||
|
|
||||||
|
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||||
|
host_d,host_offset);
|
||||||
|
|
||||||
|
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||||
|
host_cut_ljsq,host_sigma);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_cut_coulsq=host_cut_coulsq;
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
_alf=alf;
|
||||||
|
_e_shift=e_shift;
|
||||||
|
_f_shift=f_shift;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
|
||||||
|
+cutsq_sigma.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornCoulWolfT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
coeff1.clear();
|
||||||
|
coeff2.clear();
|
||||||
|
cutsq_sigma.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double BornCoulWolfT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(BornCoulWolf<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&cutsq_sigma, &_cut_coulsq, &_qqrd2e,
|
||||||
|
&_alf, &_e_shift, &_f_shift,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
|
||||||
|
&nbor_pitch, &this->atom->q,
|
||||||
|
&cutsq_sigma, &_cut_coulsq,
|
||||||
|
&_qqrd2e, &_alf, &_e_shift, &_f_shift,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class BornCoulWolf<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,282 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// born_coul_wolf.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the born/coul/wolf pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MY_PIS (acctyp)1.77245385090551602729
|
||||||
|
|
||||||
|
// General born/coul/wolf pair kernel (coefficient tables read from global
// memory, indexed by itype*lj_types+jtype).
//
// Table layout, as packed by the host class:
//   coeff1      = (rhoinv, born1, born2, born3)
//   coeff2      = (a, c, d, offset)
//   cutsq_sigma = (cutsq, cut_ljsq, sigma)
//   sp_lj_in    = special LJ factors [0-3], special Coulomb factors [4-7]
//
// Per-atom force, energy and virial accumulators are handed to
// store_answers_q() at the end. atom_info, nbor_info, fetch/fetch4, sbmask,
// NEIGHMASK and the erfc polynomial constants come from headers outside this
// file view.
__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
                          __global numtyp4* coeff2, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_,
                          __global numtyp4 *cutsq_sigma,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp alf, const numtyp e_shift,
                          const numtyp f_shift, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];
  sp_lj[4]=sp_lj_in[4];
  sp_lj[5]=sp_lj_in[5];
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

    // Wolf self-energy term, split evenly over the threads assigned to atom i
    if (eflag>0) {
      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
      e_coul += (acctyp)2.0*e_self;
    }

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // factor_coul is stored as (1 - special_coul), i.e. the fraction of
      // the bare Coulomb term to subtract for this pair
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp r2inv = ucl_recip(rsq);
        numtyp forcecoul, forceborn, force, prefactor, _erfc;
        numtyp r6inv = (numtyp)0.0;
        numtyp v_sh = (numtyp)0.0;
        numtyp rexp = (numtyp)0.0;

        if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
                       + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
        } else forceborn = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          numtyp r=ucl_rsqrt(r2inv);
          numtyp arij = alf * r;
          numtyp expm2 = ucl_exp(-arij*arij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
          // v_sh already carries the qqrd2e*qi*qj/r prefactor
          v_sh = (_erfc - e_shift*r)*prefactor;
          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
        } else forcecoul = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          // Shifted Coulomb energy minus the excluded special-bond part.
          // v_sh must not be scaled by prefactor a second time.
          if (rsq < cut_coulsq)
            e_coul += v_sh-factor_coul*prefactor;
          // Same Born cutoff as the force branch above, so rexp and r6inv
          // are always the values computed there
          if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
                     + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// born/coul/wolf pair kernel for the case where the type tables fit in
// local (shared) memory: coeff1 and coeff2 are staged into __local arrays of
// MAX_SHARED_TYPES*MAX_SHARED_TYPES entries and indexed by
// itype*MAX_SHARED_TYPES+jtype. cutsq_sigma is still read from global memory.
//
// Table layout, as packed by the host class:
//   coeff1      = (rhoinv, born1, born2, born3)
//   coeff2      = (a, c, d, offset)   (only staged when eflag>0)
//   cutsq_sigma = (cutsq, cut_ljsq, sigma)
//   sp_lj_in    = special LJ factors [0-3], special Coulomb factors [4-7]
//
// atom_info, nbor_info, fetch/fetch4, fast_mul, sbmask, NEIGHMASK and the
// erfc polynomial constants come from headers outside this file view.
__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
                               __global numtyp4* coeff2_in,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, __global numtyp *q_,
                               __global numtyp4 *cutsq_sigma,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp alf, const numtyp e_shift,
                               const numtyp f_shift, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    coeff1[tid]=coeff1_in[tid];
    if (eflag>0)
      coeff2[tid]=coeff2_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The staged tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    // Wolf self-energy term, split evenly over the threads assigned to atom i
    if (eflag>0) {
      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
      e_coul += (acctyp)2.0*e_self;
    }

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // factor_coul is stored as (1 - special_coul), i.e. the fraction of
      // the bare Coulomb term to subtract for this pair
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, forceborn, force, prefactor, _erfc;
        numtyp r6inv = (numtyp)0.0;
        numtyp v_sh = (numtyp)0.0;
        numtyp rexp = (numtyp)0.0;

        // The Born cutoff is cutsq_sigma.y; coeff1.w is the born3
        // coefficient, not a distance
        if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
                       + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
        } else forceborn = (numtyp)0.0;

        if (rsq < cut_coulsq) {
          numtyp r=ucl_rsqrt(r2inv);
          numtyp arij = alf * r;
          numtyp expm2 = ucl_exp(-arij*arij);
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          fetch(prefactor,j,q_tex);
          prefactor *= qqrd2e * qtmp/r;
          // v_sh already carries the qqrd2e*qi*qj/r prefactor
          v_sh = (_erfc - e_shift*r)*prefactor;
          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
        } else forcecoul = (numtyp)0.0;

        force = (forceborn + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          // Shifted Coulomb energy minus the excluded special-bond part.
          // v_sh must not be scaled by prefactor a second time.
          if (rsq < cut_coulsq)
            e_coul += v_sh-factor_coul*prefactor;
          // Same Born cutoff as the force branch above, so rexp and r6inv
          // are always the values computed there
          if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
                     + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,89 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_wolf.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the born/coul/wolf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
// The guard is specific to this header. Reusing the born/coul/long guard
// name would make whichever of the two headers is included second expand to
// nothing.
#ifndef LAL_BORN_COUL_WOLF_H
#define LAL_BORN_COUL_WOLF_H

#include "lal_base_charge.h"

namespace LAMMPS_AL {

/// Accelerated born/coul/wolf pair style built on BaseCharge
template <class numtyp, class acctyp>
class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
 public:
  BornCoulWolf();
  ~BornCoulWolf();

  /// Set up the coefficient tables and Wolf parameters for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
           double **host_born1, double **host_born2, double **host_born3,
           double **host_a, double **host_c, double **host_d,
           double **host_sigma, double **host_offset, double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen, double **host_cut_ljsq,
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double alf, const double e_shift,
           const double f_shift);

  /// Clear all host and device data
  /** \note Called by the destructor; a no-op unless init() has succeeded
    *       since the last clear() **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
  /// coeff1.w = born3
  UCL_D_Vec<numtyp4> coeff1;
  /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
  UCL_D_Vec<numtyp4> coeff2;
  /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
  /// cutsq_sigma.z = sigma
  UCL_D_Vec<numtyp4> cutsq_sigma;
  /// Special LJ values [0-3] and Special Coul values [4-7]
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Row length of the type-pair tables (padded when shared_types is set)
  int _lj_types;

  /// Coulomb cutoff squared, Coulomb conversion factor and Wolf parameters
  numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;

 private:
  /// True between a successful init() and the next clear()
  bool _allocated;
  /// Launch the pair kernel for the current step
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
|
|
@ -0,0 +1,134 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_coul_wolf_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to born/coul/wolf acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_born_coul_wolf.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Initialize the file-scope born/coul/wolf accelerator object.
//
// Any previous state is cleared first. World rank 0 runs init() on its own
// before a world barrier; the remaining processes then run init() one
// gpu_rank at a time, separated by gpu barriers. Progress messages go to
// `screen` on replica 0 when `screen` is non-null.
//
//   max_nbors  forwarded to BornCoulWolf::init() (initial number of rows in
//              the neighbor matrix). It was previously ignored in favour of
//              a hard-coded 300.
//   gpu_mode   overwritten with device->gpu_mode()
//
// Returns this process's init() result (0 on success); the GPU overhead
// estimate is only run when that result is 0.
int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                    double **host_born1, double **host_born2, double **host_born3,
                    double **host_a, double **host_c, double **host_d,
                    double **sigma, double **offset, double *special_lj, const int inum,
                    const int nall, const int max_nbors, const int maxspecial,
                    const double cell_size, int &gpu_mode, FILE *screen,
                    double **host_cut_ljsq, double host_cut_coulsq,
                    double *host_special_coul, const double qqrd2e,
                    const double alf, const double e_shift, const double f_shift) {
  BORNCWMF.clear();
  gpu_mode=BORNCWMF.device->gpu_mode();
  double gpu_split=BORNCWMF.device->particle_split();
  int first_gpu=BORNCWMF.device->first_device();
  int last_gpu=BORNCWMF.device->last_device();
  int world_me=BORNCWMF.device->world_me();
  int gpu_rank=BORNCWMF.device->gpu_rank();
  int procs_per_gpu=BORNCWMF.device->procs_per_gpu();

  BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu);

  bool message=false;
  if (BORNCWMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Process 0 goes first, on its own
  int init_ok=0;
  if (world_me==0)
    init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                          host_born3, host_a, host_c, host_d, sigma,
                          offset, special_lj, inum, nall, max_nbors,
                          maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e,
                          alf, e_shift, f_shift);

  BORNCWMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining processes take turns, one gpu_rank per iteration
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                            host_born3, host_a, host_c, host_d, sigma,
                            offset, special_lj, inum, nall, max_nbors,
                            maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                            host_cut_coulsq, host_special_coul, qqrd2e,
                            alf, e_shift, f_shift);

    BORNCWMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    BORNCWMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Library entry point: resets the file-scope born/coul/wolf accelerator
// object by forwarding to BornCoulWolf::clear().
void borncw_gpu_clear()
{
  BORNCWMF.clear();
}
|
||||||
|
|
||||||
|
int** borncw_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double *boxlo,
|
||||||
|
double *prd) {
|
||||||
|
return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Library entry point for the born/coul/wolf compute call that is given an
// existing neighbor list (ilist / numj / firstneigh). Every argument is
// forwarded unchanged to BORNCWMF.compute().
void borncw_gpu_compute(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, int *ilist, int *numj,
                        int **firstneigh, const bool eflag, const bool vflag,
                        const bool eatom, const bool vatom, int &host_start,
                        const double cpu_time, bool &success, double *host_q,
                        const int nlocal, double *boxlo, double *prd) {
  BORNCWMF.compute(ago, inum_full, nall,
                   host_x, host_type,
                   ilist, numj, firstneigh,
                   eflag, vflag, eatom, vatom,
                   host_start, cpu_time, success,
                   host_q, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double borncw_gpu_bytes() {
|
||||||
|
return BORNCWMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,124 @@
|
||||||
|
/***************************************************************************
|
||||||
|
born_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to born acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_born.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static Born<PRECISION,ACC_PRECISION> BORNMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Initialize the file-scope born accelerator object.
//
// Any previous state is cleared first. World rank 0 runs init() on its own
// before a world barrier; the remaining processes then run init() one
// gpu_rank at a time, separated by gpu barriers. Progress messages go to
// `screen` on replica 0 when `screen` is non-null.
//
//   max_nbors  forwarded to the init() call in the neighbor-count slot. It
//              was previously ignored in favour of a hard-coded 300.
//   gpu_mode   overwritten with device->gpu_mode()
//
// Returns this process's init() result; the GPU overhead estimate is only
// run when that result is 0.
int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
                  double **host_born1, double **host_born2,
                  double **host_born3, double **host_a, double **host_c,
                  double **host_d, double **sigma,
                  double **offset, double *special_lj, const int inum,
                  const int nall, const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen) {
  BORNMF.clear();
  gpu_mode=BORNMF.device->gpu_mode();
  double gpu_split=BORNMF.device->particle_split();
  int first_gpu=BORNMF.device->first_device();
  int last_gpu=BORNMF.device->last_device();
  int world_me=BORNMF.device->world_me();
  int gpu_rank=BORNMF.device->gpu_rank();
  int procs_per_gpu=BORNMF.device->procs_per_gpu();

  BORNMF.device->init_message(screen,"born",first_gpu,last_gpu);

  bool message=false;
  if (BORNMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Process 0 goes first, on its own
  int init_ok=0;
  if (world_me==0)
    init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                        host_born3, host_a, host_c, host_d, sigma,
                        offset, special_lj, inum, nall, max_nbors,
                        maxspecial, cell_size, gpu_split, screen);

  BORNMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining processes take turns, one gpu_rank per iteration
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
                          host_born3, host_a, host_c, host_d, sigma,
                          offset, special_lj, inum, nall, max_nbors,
                          maxspecial, cell_size, gpu_split, screen);

    BORNMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    BORNMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Release everything held by the file-static Born accelerator object.
void born_gpu_clear()
{
  BORNMF.clear();
}
|
||||||
|
|
||||||
|
int ** born_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success) {
|
||||||
|
return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward one step's data, together with a caller-supplied neighbor list
// (ilist / numj / firstneigh), to BORNMF.compute(). Nothing is returned;
// host_start and success are passed by reference to compute().
void born_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success)
{
  BORNMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
                 firstneigh, eflag, vflag, eatom, vatom, host_start,
                 cpu_time, success);
}
|
||||||
|
|
||||||
|
double born_gpu_bytes() {
|
||||||
|
return BORNMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,181 @@
|
||||||
|
/***************************************************************************
|
||||||
|
colloid.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the colloid pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "colloid_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *colloid=0;
|
||||||
|
#else
|
||||||
|
#include "colloid_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_colloid.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define ColloidT Colloid<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
ColloidT::Colloid() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
ColloidT::~Colloid() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int ColloidT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int ColloidT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3,
|
||||||
|
double **host_lj4, double **host_offset,
|
||||||
|
double *host_special_lj, double **host_a12,
|
||||||
|
double **host_a1, double **host_a2,
|
||||||
|
double **host_d1, double **host_d2,
|
||||||
|
double **host_sigma3, double **host_sigma6,
|
||||||
|
int **host_form, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,colloid,"k_colloid");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||||
|
host_cutsq);
|
||||||
|
|
||||||
|
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||||
|
host_offset);
|
||||||
|
|
||||||
|
colloid1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,colloid1,host_write,host_a12,host_a1,
|
||||||
|
host_a2);
|
||||||
|
|
||||||
|
colloid2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,colloid2,host_write,host_d1,host_d2,
|
||||||
|
host_sigma3,host_sigma6);
|
||||||
|
|
||||||
|
UCL_H_Vec<int> dview_form(lj_types*lj_types,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++) dview_form[i]=0;
|
||||||
|
|
||||||
|
form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<ntypes; i++)
|
||||||
|
for (int j=0; j<ntypes; j++) {
|
||||||
|
dview_form[i*lj_types+j]=host_form[i][j];
|
||||||
|
}
|
||||||
|
ucl_copy(form,dview_form,false);
|
||||||
|
|
||||||
|
UCL_H_Vec<double> dview;
|
||||||
|
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||||
|
ucl_copy(sp_lj,dview,false);
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()
|
||||||
|
+colloid1.row_bytes()+colloid2.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void ColloidT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
lj1.clear();
|
||||||
|
lj3.clear();
|
||||||
|
colloid1.clear();
|
||||||
|
colloid2.clear();
|
||||||
|
form.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double ColloidT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(Colloid<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and virials
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void ColloidT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||||
|
&colloid1, &colloid2, &form,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||||
|
&colloid1, &colloid2, &form,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class Colloid<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,329 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// colloid.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the colloid pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// General colloid pair kernel: type tables are read from global memory.
//
// Table layout (index = itype*lj_types + jtype):
//   lj1      .x = lj1,  .y = lj2,  .z = cutsq
//   lj3      .x = lj3,  .y = lj4,  .z = offset
//   colloid1 .x = a12,  .y = a1,   .z = a2
//   colloid2 .x = d1,   .y = d2,   .z = sigma3, .w = sigma6
//   form     0 = SMALL_SMALL, 1 = SMALL_LARGE, 2 = LARGE_LARGE
// sp_lj_in holds 4 scaling factors selected by sbmask(j).
// eflag / vflag > 0 enable energy / virial accumulation.
//
// All floating-point literals are cast to numtyp so that single-precision
// builds do not silently promote the SMALL_LARGE force and the dUA term to
// double (this matches k_colloid_fast).
__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
                        __global numtyp4* lj3, const int lj_types,
                        __global numtyp *sp_lj_in,
                        __global numtyp4* colloid1,
                        __global numtyp4* colloid2,
                        __global int *form,
                        __global int *dev_nbor,
                        __global int *dev_packed, __global acctyp4 *ans,
                        __global acctyp *engv, const int eflag,
                        const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  // Per-thread accumulators.
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The neighbor entry carries the special-bond selector (sbmask) on
      // top of the atom index (NEIGHMASK).
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<lj1[mtype].z) {
        numtyp r,r2inv,r6inv;
        numtyp c1,c2,fR,evdwl;
        numtyp K[9],h[4],g[4];
        numtyp force = (numtyp)0;

        if (form[mtype]==0) { // SMALL_SMALL
          r2inv=ucl_recip(rsq);
          r6inv = r2inv*r2inv*r2inv;
          force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
          force*=factor_lj;
        } else if (form[mtype]==1) { // SMALL_LARGE
          c2 = colloid1[mtype].z;
          K[1] = c2*c2;
          K[2] = rsq;
          K[0] = K[1] - rsq;
          K[4] = rsq*rsq;
          K[3] = K[1] - K[2];
          K[3] *= K[3]*K[3];
          K[6] = K[3]*K[3];
          fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
          force = (numtyp)4.0/(numtyp)15.0*fR *
            ((numtyp)2.0*(K[1]+K[2]) *
             (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
             colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
          force*=factor_lj;
        } else if (form[mtype]==2) { // LARGE_LARGE
          r = ucl_sqrt(rsq);
          c1 = colloid1[mtype].y;
          c2 = colloid1[mtype].z;
          K[0] = c1*c2;
          K[1] = c1+c2;
          K[2] = c1-c2;
          K[3] = K[1]+r;
          K[4] = K[1]-r;
          K[5] = K[2]+r;
          K[6] = K[2]-r;
          K[7] = ucl_recip(K[3]*K[4]);
          K[8] = ucl_recip(K[5]*K[6]);
          // g[1] and g[3] are written as -powr(-K, -7): the argument of
          // powr is negated and the sign re-applied to the result.
          g[0] = ucl_powr(K[3],(numtyp)-7.0);
          g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
          g[2] = ucl_powr(K[5],(numtyp)-7.0);
          g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
          h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
          h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
          h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
          h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
          g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
          g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
          g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
          g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];

          fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
          evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
          numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
            (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
             ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
          force = factor_lj * (dUR+dUA)/r;
        }

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          // Reuses r6inv / fR / K / evdwl set by the matching force branch.
          numtyp e=(numtyp)0.0;
          if (form[mtype]==0) {
            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          } else if (form[mtype]==1) {
            e=(numtyp)2.0/(numtyp)9.0*fR *
              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +
              (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
          } else if (form[mtype]==2) {
            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
          }
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// Colloid pair kernel for the case where the type tables fit in __local
// memory. Tables are indexed by MAX_SHARED_TYPES*itype + jtype:
//   lj1      .x = lj1,  .y = lj2,  .z = cutsq
//   lj3      .x = lj3,  .y = lj4,  .z = offset   (only loaded when eflag>0)
//   colloid1 .x = a12,  .y = a1,   .z = a2
//   colloid2 .x = d1,   .y = d2,   .z = sigma3, .w = sigma6
//   form     0 = SMALL_SMALL, 1 = SMALL_LARGE, 2 = LARGE_LARGE
// eflag / vflag > 0 enable energy / virial accumulation.
__kernel void k_colloid_fast(__global numtyp4 *x_,
                             __global numtyp4 *lj1_in,
                             __global numtyp4 *lj3_in,
                             __global numtyp *sp_lj_in,
                             __global numtyp4 *colloid1_in,
                             __global numtyp4 *colloid2_in,
                             __global int *form_in,
                             __global int *dev_nbor, __global int *dev_packed,
                             __global acctyp4 *ans, __global acctyp *engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage the type tables in local memory; each thread copies the entry
  // matching its tid.
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    colloid1[tid]=colloid1_in[tid];
    colloid2[tid]=colloid2_in[tid];
    form[tid]=form_in[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];
  }

  // Per-thread accumulators.
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // Tables must be fully written before any thread reads them.
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // Split the neighbor entry into special-bond selector and atom index.
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Separation vector i-j and its squared length
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      // Outside the cutoff: nothing to accumulate for this neighbor.
      if (!(rsq<lj1[mtype].z))
        continue;

      numtyp r,r2inv,r6inv;
      numtyp c1,c2,fR,evdwl;
      numtyp K[9],h[4],g[4];
      numtyp force = (numtyp)0;

      const int pair_form = form[mtype];

      switch (pair_form) {
      case 0: // SMALL_SMALL
        r2inv=ucl_recip(rsq);
        r6inv = r2inv*r2inv*r2inv;
        force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj;
        break;

      case 1: // SMALL_LARGE
        c2 = colloid1[mtype].z;
        K[1] = c2*c2;
        K[2] = rsq;
        K[0] = K[1] - rsq;
        K[4] = rsq*rsq;
        K[3] = K[1] - K[2];
        K[3] *= K[3]*K[3];
        K[6] = K[3]*K[3];
        fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
        force = (numtyp)4.0/(numtyp)15.0*fR *
          ((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) *
          colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
        force*=factor_lj;
        break;

      case 2: { // LARGE_LARGE
        r = ucl_sqrt(rsq);
        c1 = colloid1[mtype].y;
        c2 = colloid1[mtype].z;
        K[0] = c1*c2;
        K[1] = c1+c2;
        K[2] = c1-c2;
        K[3] = K[1]+r;
        K[4] = K[1]-r;
        K[5] = K[2]+r;
        K[6] = K[2]-r;
        K[7] = ucl_recip(K[3]*K[4]);
        K[8] = ucl_recip(K[5]*K[6]);
        g[0] = ucl_powr(K[3],(numtyp)-7.0);
        g[1] = -ucl_powr(-K[4],(numtyp)-7.0);
        g[2] = ucl_powr(K[5],(numtyp)-7.0);
        g[3] = -ucl_powr(-K[6],(numtyp)-7.0);
        h[0] = ((K[3]+(numtyp)5.0*K[1])*K[3]+(numtyp)30.0*K[0])*g[0];
        h[1] = ((K[4]+(numtyp)5.0*K[1])*K[4]+(numtyp)30.0*K[0])*g[1];
        h[2] = ((K[5]+(numtyp)5.0*K[2])*K[5]-(numtyp)30.0*K[0])*g[2];
        h[3] = ((K[6]+(numtyp)5.0*K[2])*K[6]-(numtyp)30.0*K[0])*g[3];
        g[0] *= (numtyp)42.0*K[0]/K[3]+(numtyp)6.0*K[1]+K[3];
        g[1] *= (numtyp)42.0*K[0]/K[4]+(numtyp)6.0*K[1]+K[4];
        g[2] *= (numtyp)-42.0*K[0]/K[5]+(numtyp)6.0*K[2]+K[5];
        g[3] *= (numtyp)-42.0*K[0]/K[6]+(numtyp)6.0*K[2]+K[6];

        fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
        evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
        numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
        numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] +
          ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
        force = factor_lj * (dUR+dUA)/r;
        break;
      }

      default: // any other form code contributes no force
        force = (numtyp)0.0;
        break;
      }

      f.x+=delx*force;
      f.y+=dely*force;
      f.z+=delz*force;

      if (eflag>0) {
        // Uses the intermediates left behind by the matching force case.
        numtyp e=(numtyp)0.0;
        switch (pair_form) {
        case 0:
          e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          break;
        case 1:
          e=(numtyp)2.0/(numtyp)9.0*fR *
            ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
            colloid2[mtype].w/K[6]);
          break;
        case 2:
          e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
          break;
        default:
          break;
        }
        energy+=factor_lj*(e-lj3[mtype].z);
      }

      if (vflag>0) {
        virial[0] += delx*delx*force;
        virial[1] += dely*dely*force;
        virial[2] += delz*delz*force;
        virial[3] += delx*dely*force;
        virial[4] += delx*delz*force;
        virial[5] += dely*delz*force;
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,89 @@
|
||||||
|
/***************************************************************************
|
||||||
|
colloid.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the colloid pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_COLLOID_H
|
||||||
|
#define LAL_COLLOID_H
|
||||||
|
|
||||||
|
#include "lal_base_atomic.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class Colloid : public BaseAtomic<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
Colloid();
|
||||||
|
~Colloid();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq,
|
||||||
|
double **host_lj1, double **host_lj2, double **host_lj3,
|
||||||
|
double **host_lj4, double **host_offset, double *host_special_lj,
|
||||||
|
double **host_a12, double **host_a1, double **host_a2,
|
||||||
|
double **host_d1, double **host_d2, double **host_sigma3,
|
||||||
|
double **host_sigma6, int **host_form,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
|
||||||
|
UCL_D_Vec<numtyp4> lj1;
|
||||||
|
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
|
||||||
|
UCL_D_Vec<numtyp4> lj3;
|
||||||
|
/// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2
|
||||||
|
UCL_D_Vec<numtyp4> colloid1;
|
||||||
|
/// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3,
|
||||||
|
/// colloid2.w = sigma6
|
||||||
|
UCL_D_Vec<numtyp4> colloid2;
|
||||||
|
/// form
|
||||||
|
UCL_D_Vec<int> form;
|
||||||
|
/// Special LJ values
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,127 @@
|
||||||
|
/***************************************************************************
|
||||||
|
colloid_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to colloid acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_colloid.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static Colloid<PRECISION,ACC_PRECISION> COLLMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Set up the file-static colloid accelerator object (COLLMF) for a new run.
//
// Any previous state is cleared first. World rank 0 then calls init() on its
// own (the progress message describes this as "compiling on process 0"),
// all ranks meet at a world barrier, and the remaining ranks call init() in
// turn, one gpu_rank per loop iteration, separated by a GPU barrier.
//
// The coefficient tables (cutsq, host_lj1..host_lj4, offset, special_lj,
// host_a12..host_sigma6, host_form) and inum, nall, max_nbors, maxspecial,
// cell_size are forwarded unchanged to Colloid::init().
// gpu_mode is an output: it receives COLLMF.device->gpu_mode().
// screen may be NULL, in which case no progress text is printed.
//
// Returns the value of init() on this process (0 if this process never
// called it); on 0 the GPU overhead estimate is run before returning.
//
// NOTE: max_nbors is now forwarded to init(). Previously the argument was
// ignored and the literal 300 was passed instead.
int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                     double **host_lj2, double **host_lj3, double **host_lj4,
                     double **offset, double *special_lj,
                     double **host_a12, double **host_a1, double **host_a2,
                     double **host_d1, double **host_d2, double **host_sigma3,
                     double **host_sigma6, int **host_form, const int inum,
                     const int nall, const int max_nbors, const int maxspecial,
                     const double cell_size, int &gpu_mode, FILE *screen) {
  COLLMF.clear();
  gpu_mode=COLLMF.device->gpu_mode();
  double gpu_split=COLLMF.device->particle_split();
  int first_gpu=COLLMF.device->first_device();
  int last_gpu=COLLMF.device->last_device();
  int world_me=COLLMF.device->world_me();
  int gpu_rank=COLLMF.device->gpu_rank();
  int procs_per_gpu=COLLMF.device->procs_per_gpu();

  COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu);

  // Only replica 0 with a valid stream reports progress.
  bool message=false;
  if (COLLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Stage 1: world rank 0 initializes alone.
  int init_ok=0;
  if (world_me==0)
    init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, offset, special_lj, host_a12, host_a1,
                        host_a2, host_d1, host_d2, host_sigma3,
                        host_sigma6, host_form, inum, nall, max_nbors,
                        maxspecial, cell_size, gpu_split, screen);

  COLLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Stage 2: every other rank initializes, one gpu_rank at a time.
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                          host_lj4, offset, special_lj, host_a12, host_a1,
                          host_a2, host_d1, host_d2, host_sigma3,
                          host_sigma6, host_form, inum, nall, max_nbors,
                          maxspecial, cell_size, gpu_split, screen);

    COLLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    COLLMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Release everything held by the file-static colloid accelerator object.
void colloid_gpu_clear()
{
  COLLMF.clear();
}
|
||||||
|
|
||||||
|
int ** colloid_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success) {
|
||||||
|
return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward one step's data, together with a caller-supplied neighbor list
// (ilist / numj / firstneigh), to COLLMF.compute(). Nothing is returned;
// host_start and success are passed by reference to compute().
void colloid_gpu_compute(const int ago, const int inum_full, const int nall,
                         double **host_x, int *host_type, int *ilist,
                         int *numj, int **firstneigh, const bool eflag,
                         const bool vflag, const bool eatom, const bool vatom,
                         int &host_start, const double cpu_time,
                         bool &success)
{
  COLLMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
                 firstneigh, eflag, vflag, eatom, vatom, host_start,
                 cpu_time, success);
}
|
||||||
|
|
||||||
|
double colloid_gpu_bytes() {
|
||||||
|
return COLLMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
/***************************************************************************
|
||||||
|
coul_dsf.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the coul/dsf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 8/15/2012
|
||||||
|
    email                : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#if defined(USE_OPENCL)
|
||||||
|
#include "coul_dsf_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *coul_dsf=0;
|
||||||
|
#else
|
||||||
|
#include "coul_dsf_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_coul_dsf.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define CoulDSFT CoulDSF<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
CoulDSFT::CoulDSF() : BaseCharge<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
CoulDSFT::~CoulDSF() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int CoulDSFT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int CoulDSFT::init(const int ntypes, const int nlocal, const int nall,
|
||||||
|
const int max_nbors, const int maxspecial,
|
||||||
|
const double cell_size, const double gpu_split, FILE *_screen,
|
||||||
|
const double host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e, const double e_shift, const double f_shift,
|
||||||
|
const double alpha) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,coul_dsf,"k_coul_dsf");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
_cut_coulsq=host_cut_coulsq;
|
||||||
|
_e_shift=e_shift;
|
||||||
|
_f_shift=f_shift;
|
||||||
|
_alpha=alpha;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,4,false);
|
||||||
|
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void CoulDSFT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double CoulDSFT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(CoulDSF<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void CoulDSFT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag,
|
||||||
|
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv,
|
||||||
|
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class CoulDSF<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,214 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// coul_dsf.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the coul/dsf pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin : 8/15/2012
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MY_PIS (acctyp)1.77245385090551602729
|
||||||
|
|
||||||
|
// Damped-shifted-force Coulomb kernel (general variant).
//
// For atom slot ii, loops over that atom's neighbors with stride n_stride
// and accumulates force, Coulomb energy and virial for every pair with
// rsq < cut_coulsq, then hands the partial sums to store_answers_q.
//
//   x_         positions (read through pos_tex / fetch4)
//   lj_types   type-table dimension; not used by this kernel body
//   sp_lj_in   4 special-Coulomb scaling factors, indexed by sbmask(j)
//   dev_nbor, dev_packed, nbor_pitch  neighbor data consumed by nbor_info
//   ans, engv  output buffers written by store_answers_q
//   eflag, vflag  >0 to accumulate energy / virial
//   inum       number of atom slots; threads with ii>=inum do no pair work
//   q_         charges (read through q_tex / fetch)
//   cut_coulsq, qqrd2e, e_shift, f_shift, alpha  DSF parameters
//   t_per_atom threads cooperating on one atom
//
// atom_info, nbor_info, fetch, fetch4, sbmask, NEIGHMASK, EWALD_P, A1..A5
// and store_answers_q are defined outside this block.
__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
                         __global numtyp *sp_lj_in, __global int *dev_nbor,
                         __global int *dev_packed, __global acctyp4 *ans,
                         __global acctyp *engv, const int eflag,
                         const int vflag, const int inum,
                         const int nbor_pitch, __global numtyp *q_ ,
                         const numtyp cut_coulsq, const numtyp qqrd2e,
                         const numtyp e_shift, const numtyp f_shift,
                         const numtyp alpha, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Every thread stores the same four values, so no barrier is used here
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond code is packed into j; read it before masking
      numtyp factor_coul, r, prefactor, erfcc;
      factor_coul = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq < cut_coulsq) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force;

        r = ucl_sqrt(rsq);
        fetch(prefactor,j,q_tex);
        prefactor *= factor_coul * qqrd2e*qtmp/r;
        numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
        erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
        // The literal is cast so that single-precision builds do not
        // promote the whole expression to double
        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                                 rsq*f_shift);

        force = forcecoul * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        // Already inside the cutoff test, so no second rsq check is needed
        if (eflag>0) {
          numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
          e_coul += e;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// Damped-shifted-force Coulomb kernel (fast variant).
//
// Same pair computation as k_coul_dsf, but the four special-Coulomb
// factors are loaded cooperatively (threads with tid<4 load one each) and
// a barrier is issued before any thread reads them. There is no lj_types
// argument.
//
//   x_         positions (read through pos_tex / fetch4)
//   sp_lj_in   4 special-Coulomb scaling factors, indexed by sbmask(j)
//   dev_nbor, dev_packed, nbor_pitch  neighbor data consumed by nbor_info
//   ans, engv  output buffers written by store_answers_q
//   eflag, vflag  >0 to accumulate energy / virial
//   inum       number of atom slots; threads with ii>=inum do no pair work
//   q_         charges (read through q_tex / fetch)
//   cut_coulsq, qqrd2e, e_shift, f_shift, alpha  DSF parameters
//   t_per_atom threads cooperating on one atom
//
// atom_info, nbor_info, fetch, fetch4, sbmask, NEIGHMASK, EWALD_P, A1..A5
// and store_answers_q are defined outside this block.
__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
                              __global int *dev_nbor, __global int *dev_packed,
                              __global acctyp4 *ans, __global acctyp *engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, __global numtyp *q_,
                              const numtyp cut_coulsq, const numtyp qqrd2e,
                              const numtyp e_shift, const numtyp f_shift,
                              const numtyp alpha, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // Make the cooperative sp_lj load visible before any thread reads it
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond code is packed into j; read it before masking
      numtyp factor_coul, r, prefactor, erfcc;
      factor_coul = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq < cut_coulsq) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force;

        r = ucl_sqrt(rsq);
        fetch(prefactor,j,q_tex);
        prefactor *= factor_coul * qqrd2e*qtmp/r;
        numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
        numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
        erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
        // The literal is cast so that single-precision builds do not
        // promote the whole expression to double
        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                                 rsq*f_shift);

        force = forcecoul * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        // Already inside the cutoff test, so no second rsq check is needed
        if (eflag>0) {
          numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
          e_coul += e;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,78 @@
|
||||||
|
/***************************************************************************
|
||||||
|
coul_dsf.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the coul/dsf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 8/15/2012
|
||||||
|
    email                : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
// The guard is named after this header (coul/dsf) so that it cannot
// collide with a guard belonging to an lj/dsf header.
#ifndef LAL_COUL_DSF_H
#define LAL_COUL_DSF_H

#include "lal_base_charge.h"

namespace LAMMPS_AL {

/// Accelerator class for the coul/dsf pair style
template <class numtyp, class acctyp>
class CoulDSF : public BaseCharge<numtyp, acctyp> {
 public:
  CoulDSF();
  ~CoulDSF();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param host_cut_coulsq Coulomb cutoff passed to the kernel as cut_coulsq
    * \param host_special_coul 4 special Coulomb factors copied to sp_lj
    * \param qqrd2e, e_shift, f_shift, alpha scalars passed to the kernel
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, const int nlocal, const int nall,
           const int max_nbors, const int maxspecial,
           const double cell_size, const double gpu_split, FILE *screen,
           const double host_cut_coulsq, double *host_special_coul,
           const double qqrd2e, const double e_shift, const double f_shift,
           const double alpha);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine
    * NOTE(review): the visible init() body does not call clear() itself;
    * confirm whether the base class or the caller does. **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// Special Coul values [0-3] (init() allocates and copies 4 entries)
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  /// Value of the qqrd2e argument given to init()
  numtyp _qqrd2e;

 private:
  bool _allocated;
  /// DSF parameters stored by init() and passed to the kernel by loop()
  numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
|
|
@ -0,0 +1,125 @@
|
||||||
|
/***************************************************************************
|
||||||
|
coul_dsf_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to coul/dsf acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 8/15/2012
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_coul_dsf.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static CoulDSF<PRECISION,ACC_PRECISION> CDMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
/// Initialize the coul/dsf accelerator object on every process.
/** Process 0 initializes first, followed by a world barrier. The other
  * processes sharing a device then initialize one gpu_rank at a time,
  * with a gpu barrier after each turn. Progress is printed to \a screen
  * when replica_me()==0 and \a screen is non-NULL.
  *
  * \param max_nbors forwarded to CoulDSF::init() (it was previously
  *                  ignored in favour of a hard-coded 300)
  * \param gpu_mode  overwritten with the device's gpu_mode()
  * \return the value of init_ok: 0 if no init() call on this process
  *         failed, otherwise the code from CoulDSF::init() **/
int cdsf_gpu_init(const int ntypes, const int inum, const int nall,
                  const int max_nbors, const int maxspecial,
                  const double cell_size, int &gpu_mode, FILE *screen,
                  const double host_cut_coulsq, double *host_special_coul,
                  const double qqrd2e, const double e_shift, const double f_shift,
                  const double alpha) {
  CDMF.clear();
  gpu_mode=CDMF.device->gpu_mode();
  double gpu_split=CDMF.device->particle_split();
  int first_gpu=CDMF.device->first_device();
  int last_gpu=CDMF.device->last_device();
  int world_me=CDMF.device->world_me();
  int gpu_rank=CDMF.device->gpu_rank();
  int procs_per_gpu=CDMF.device->procs_per_gpu();

  CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu);

  // Only one replica prints, and only when a screen stream is available
  bool message=false;
  if (CDMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Process 0 goes first; everyone waits at the world barrier below
  int init_ok=0;
  if (world_me==0)
    init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size,
                      gpu_split, screen, host_cut_coulsq, host_special_coul,
                      qqrd2e, e_shift, f_shift, alpha);

  CDMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining processes initialize one gpu_rank at a time
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size,
                        gpu_split, screen, host_cut_coulsq, host_special_coul,
                        qqrd2e, e_shift, f_shift, alpha);

    CDMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    CDMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
/// Release the data held by the file-level coul/dsf accelerator object
/// by forwarding to its clear() method.
void cdsf_gpu_clear()
{
  CDMF.clear();
}
|
||||||
|
|
||||||
|
int** cdsf_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double *boxlo,
|
||||||
|
double *prd) {
|
||||||
|
return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forward a force computation that uses a caller-supplied neighbor list.
/** Every argument is passed through unchanged to the CDMF.compute()
  * overload that takes ilist / numj / firstneigh plus charges, nlocal
  * and box data. Nothing is returned; results are reported through the
  * reference arguments. **/
void cdsf_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success, double *host_q,
                      const int nlocal, double *boxlo, double *prd)
{
  CDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
               firstneigh, eflag, vflag, eatom, vatom, host_start,
               cpu_time, success, host_q, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double cdsf_gpu_bytes() {
|
||||||
|
return CDMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,170 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the dipole/cut pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "dipole_lj_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *dipole_lj=0;
|
||||||
|
#else
|
||||||
|
#include "dipole_lj_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_dipole_lj.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define DipoleLJT DipoleLJ<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
DipoleLJT::DipoleLJ() : BaseDipole<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
DipoleLJT::~DipoleLJ() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int DipoleLJT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int DipoleLJT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3,
|
||||||
|
double **host_lj4, double **host_offset,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, double **host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,dipole_lj,"k_dipole_lj");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||||
|
host_cut_ljsq, host_cut_coulsq);
|
||||||
|
|
||||||
|
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||||
|
host_offset);
|
||||||
|
|
||||||
|
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
|
||||||
|
sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void DipoleLJT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
lj1.clear();
|
||||||
|
lj3.clear();
|
||||||
|
cutsq.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double DipoleLJT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(DipoleLJ<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&this->atom->quat, &cutsq,
|
||||||
|
&_qqrd2e, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||||
|
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(), &this->ans->force,
|
||||||
|
&this->ans->engv, &eflag, &vflag, &ainum,
|
||||||
|
&nbor_pitch, &this->atom->q,
|
||||||
|
&this->atom->quat, &cutsq,
|
||||||
|
&_qqrd2e, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class DipoleLJ<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,501 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// dipole_lj.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the dipole/cut pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
|
||||||
|
/* Reduce and store the per-atom force, torque, energies and virial.
 *
 * When t_per_atom>1 the partial sums held by the threads working on one
 * atom are added together through the __local scratch array red_acc
 * (rows 0-2 force, 3-5 torque; then rows 0-5 virial, 6 energy, 7 ecoul
 * when eflag>0 or vflag>0). No barrier is issued between reduction steps.
 * NOTE(review): this presumably relies on the cooperating threads
 * executing in lock-step -- confirm for the targeted devices.
 *
 * The thread with offset==0 then writes, with stride inum starting at
 * engv[ii]: energy and ecoul (if eflag>0), followed by the six virial
 * terms (if vflag>0); the force goes to ans[ii], the torque to
 * ans[ii+inum].
 *
 * The macro modifies its engv argument. The Coulomb energy is read
 * through the ecoul parameter, so the caller's variable may have any name.
 */
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,      \
                         t_per_atom, offset, eflag, vflag, ans, engv)       \
  if (t_per_atom>1) {                                                       \
    __local acctyp red_acc[8][BLOCK_PAIR];                                  \
    red_acc[0][tid]=f.x;                                                    \
    red_acc[1][tid]=f.y;                                                    \
    red_acc[2][tid]=f.z;                                                    \
    red_acc[3][tid]=tor.x;                                                  \
    red_acc[4][tid]=tor.y;                                                  \
    red_acc[5][tid]=tor.z;                                                  \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      if (offset < s) {                                                     \
        for (int r=0; r<6; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    f.x=red_acc[0][tid];                                                    \
    f.y=red_acc[1][tid];                                                    \
    f.z=red_acc[2][tid];                                                    \
    tor.x=red_acc[3][tid];                                                  \
    tor.y=red_acc[4][tid];                                                  \
    tor.z=red_acc[5][tid];                                                  \
    if (eflag>0 || vflag>0) {                                               \
      for (int r=0; r<6; r++)                                               \
        red_acc[r][tid]=virial[r];                                          \
      red_acc[6][tid]=energy;                                               \
      red_acc[7][tid]=ecoul;                                                \
      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
        if (offset < s) {                                                   \
          for (int r=0; r<8; r++)                                           \
            red_acc[r][tid] += red_acc[r][tid+s];                           \
        }                                                                   \
      }                                                                     \
      for (int r=0; r<6; r++)                                               \
        virial[r]=red_acc[r][tid];                                          \
      energy=red_acc[6][tid];                                               \
      ecoul=red_acc[7][tid];                                                \
    }                                                                       \
  }                                                                         \
  if (offset==0) {                                                          \
    engv+=ii;                                                               \
    if (eflag>0) {                                                          \
      *engv=energy;                                                         \
      engv+=inum;                                                           \
      *engv=ecoul;                                                          \
      engv+=inum;                                                           \
    }                                                                       \
    if (vflag>0) {                                                          \
      for (int i=0; i<6; i++) {                                             \
        *engv=virial[i];                                                    \
        engv+=inum;                                                         \
      }                                                                     \
    }                                                                       \
    ans[ii]=f;                                                              \
    ans[ii+inum]=tor;                                                       \
  }
|
||||||
|
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
texture<float4> mu_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
texture<int4,1> mu_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#define mu_tex mu_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Pair kernel: Lennard-Jones plus point-charge / point-dipole electrostatics.
// Per-type coefficients are read directly from the lj1, lj3 and cutsq arrays
// passed in, indexed by itype*lj_types+jtype.
//
// For every atom ii<inum the neighbor list is walked with stride n_stride and
// force, torque and - when eflag/vflag are set - energy and virial
// contributions are accumulated; store_answers_tq() then writes the results
// to ans and engv.
//
//   lj1[m].x, .y  : LJ force coefficients     lj1[m].z : LJ cutoff^2
//   lj1[m].w      : cutoff^2 for the charge/dipole terms
//   lj3[m].x, .y  : LJ energy coefficients    lj3[m].z : LJ energy offset
//   sp_lj_in[0-3] : LJ scale factors, sp_lj_in[4-7] : Coulomb scale factors,
//                   selected per neighbor through sbmask(j)
//   mu[k].w       : must be >0 for atom k to take part in any dipole term
// ---------------------------------------------------------------------------
__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_ ,
                          __global numtyp4 *mu_,
                          __global numtyp *cutsq, const numtyp qqrd2e,
                          const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Local copy of the eight special-bond scale factors
  __local numtyp sp_lj[8];
  for (int k=0; k<8; k++)
    sp_lj[k]=sp_lj_in[k];

  // Per-thread accumulators.  The Coulomb energy must keep the name e_coul:
  // store_answers_tq() refers to that identifier directly.
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f, tor;
  f.x=f.y=f.z=(acctyp)0;
  tor.x=tor.y=tor.z=(acctyp)0;
  acctyp virial[6];
  for (int k=0; k<6; k++)
    virial[k]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    // Central atom: position (type stored in .w), charge, dipole
    numtyp4 ix; fetch4(ix,i,pos_tex);
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex);
    const int itype=ix.w;

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The raw neighbor entry selects the scale factors through sbmask();
      // it is masked with NEIGHMASK before being used as an atom index.
      const numtyp factor_lj = sp_lj[sbmask(j)];
      const numtyp factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex);
      numtyp qj; fetch(qj,j,q_tex);
      numtyp4 muj; fetch4(muj,j,mu_tex);
      const int jtype=jx.w;

      // Separation vector (i minus j) and its squared length
      const numtyp dx = ix.x-jx.x;
      const numtyp dy = ix.y-jx.y;
      const numtyp dz = ix.z-jx.z;
      const numtyp r2 = dx*dx+dy*dy+dz*dz;

      const int mtype=itype*lj_types+jtype;
      // Written as a negated '<' so the pair is skipped in exactly the cases
      // where the comparison is false.
      if (!(r2<cutsq[mtype]))
        continue;

      const numtyp r2inv=ucl_recip(r2);

      // ---- Lennard-Jones ----
      numtyp force_lj, r6inv;
      if (r2 < lj1[mtype].z) {
        r6inv = r2inv*r2inv*r2inv;
        force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
      } else
        force_lj = (numtyp)0.0;

      // ---- charge / dipole electrostatics ----
      // These scalars are reused by the energy tally further down, under the
      // same conditions that assign them here.
      numtyp rinv, r3inv, r5inv;
      numtyp pdotp, pidotr, pjdotr;
      acctyp4 fcoul, tcoul;
      fcoul.x = fcoul.y = fcoul.z = (acctyp)0;
      tcoul.x = tcoul.y = tcoul.z = (acctyp)0;

      if (r2 < lj1[mtype].w) {
        rinv = ucl_rsqrt(r2);

        // charge - charge
        if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          const numtyp qq = qtmp*qj*r3inv;

          fcoul.x += qq*dx;
          fcoul.y += qq*dy;
          fcoul.z += qq*dz;
        }

        // dipole - dipole
        if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          const numtyp r7inv = r5inv*r2inv;
          pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
          pidotr = mui.x*dx + mui.y*dy + mui.z*dz;
          pjdotr = muj.x*dx + muj.y*dy + muj.z*dz;

          const numtyp a1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
          const numtyp a2 = (numtyp)3.0*r5inv*pjdotr;
          const numtyp a3 = (numtyp)3.0*r5inv*pidotr;
          const numtyp a4 = (numtyp)(-1.0)*r3inv;

          fcoul.x += a1*dx + a2*mui.x + a3*muj.x;
          fcoul.y += a1*dy + a2*mui.y + a3*muj.y;
          fcoul.z += a1*dz + a2*mui.z + a3*muj.z;

          const numtyp crossx = a4 * (mui.y*muj.z - mui.z*muj.y);
          const numtyp crossy = a4 * (mui.z*muj.x - mui.x*muj.z);
          const numtyp crossz = a4 * (mui.x*muj.y - mui.y*muj.x);

          tcoul.x += crossx + a2 * (mui.y*dz - mui.z*dy);
          tcoul.y += crossy + a2 * (mui.z*dx - mui.x*dz);
          tcoul.z += crossz + a2 * (mui.x*dy - mui.y*dx);
        }

        // dipole on i - charge on j
        if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          pidotr = mui.x*dx + mui.y*dy + mui.z*dz;
          const numtyp b1 = (numtyp)3.0*qj*r5inv * pidotr;
          const numtyp b2 = qj*r3inv;

          fcoul.x += b2*mui.x - b1*dx;
          fcoul.y += b2*mui.y - b1*dy;
          fcoul.z += b2*mui.z - b1*dz;
          tcoul.x += b2 * (mui.y*dz - mui.z*dy);
          tcoul.y += b2 * (mui.z*dx - mui.x*dz);
          tcoul.z += b2 * (mui.x*dy - mui.y*dx);
        }

        // charge on i - dipole on j (adds force only, no torque on i)
        if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          pjdotr = muj.x*dx + muj.y*dy + muj.z*dz;
          const numtyp c1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
          const numtyp c2 = qtmp*r3inv;

          fcoul.x += c1*dx - c2*muj.x;
          fcoul.y += c1*dy - c2*muj.y;
          fcoul.z += c1*dz - c2*muj.z;
        }
      }

      // ---- combine and accumulate ----
      const numtyp fq = factor_coul*qqrd2e;
      acctyp4 force;
      force.x = fq*fcoul.x + dx*force_lj;
      force.y = fq*fcoul.y + dy*force_lj;
      force.z = fq*fcoul.z + dz*force_lj;
      f.x+=force.x;
      f.y+=force.y;
      f.z+=force.z;
      tor.x+=fq*tcoul.x;
      tor.y+=fq*tcoul.y;
      tor.z+=fq*tcoul.z;

      if (eflag>0) {
        acctyp e = (acctyp)0.0;
        if (r2 < lj1[mtype].w) {
          e = qtmp*qj*rinv;
          if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
            e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
          if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
            e += -qj*r3inv*pidotr;
          if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
            e += qtmp*r3inv*pjdotr;
          e *= fq;
        }
        e_coul += e;

        if (r2 < lj1[mtype].z) {
          e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
      }

      if (vflag>0) {
        virial[0] += dx*force.x;
        virial[1] += dy*force.y;
        virial[2] += dz*force.z;
        virial[3] += dx*force.y;
        virial[4] += dx*force.z;
        virial[5] += dy*force.z;
      }
    } // for nbor

    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,
                     eflag,vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Variant of the dipole + LJ pair kernel that first stages the per-type
// tables in __local memory.  Threads with tid below
// MAX_SHARED_TYPES*MAX_SHARED_TYPES each copy one entry of lj1_in and _cutsq
// (and of lj3_in when eflag>0); threads with tid<8 copy the special-bond
// scale factors.  Tables are then indexed as itype*MAX_SHARED_TYPES + jtype.
//
// The per-pair arithmetic is the same as in k_dipole_lj:
//   lj1[m].x, .y : LJ force coefficients     lj1[m].z : LJ cutoff^2
//   lj1[m].w     : cutoff^2 for the charge/dipole terms
//   lj3[m].x, .y : LJ energy coefficients    lj3[m].z : LJ energy offset
//   sp_lj[0-3]   : LJ scale factors,  sp_lj[4-7] : Coulomb scale factors
// ---------------------------------------------------------------------------
__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, __global numtyp *q_,
                               __global numtyp4 *mu_,
                               __global numtyp *_cutsq, const numtyp qqrd2e,
                               const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage the coefficient tables in __local memory
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    cutsq[tid]=_cutsq[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];   // only needed for the energy tally
  }

  // Per-thread accumulators.  The Coulomb energy must keep the name e_coul:
  // store_answers_tq() refers to that identifier directly.
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f, tor;
  f.x=f.y=f.z=(acctyp)0;
  tor.x=tor.y=tor.z=(acctyp)0;
  acctyp virial[6];
  for (int k=0; k<6; k++)
    virial[k]=(acctyp)0;

  // Staged tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    // Central atom: position (type stored in .w), charge, dipole
    numtyp4 ix; fetch4(ix,i,pos_tex);
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex);
    const int iw=ix.w;
    // Row offset of type i in the MAX_SHARED_TYPES-wide tables
    const int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The raw neighbor entry selects the scale factors through sbmask();
      // it is masked with NEIGHMASK before being used as an atom index.
      const numtyp factor_lj = sp_lj[sbmask(j)];
      const numtyp factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex);
      numtyp qj; fetch(qj,j,q_tex);
      numtyp4 muj; fetch4(muj,j,mu_tex);
      const int mtype=itype+jx.w;

      // Separation vector (i minus j) and its squared length
      const numtyp dx = ix.x-jx.x;
      const numtyp dy = ix.y-jx.y;
      const numtyp dz = ix.z-jx.z;
      const numtyp r2 = dx*dx+dy*dy+dz*dz;

      // Written as a negated '<' so the pair is skipped in exactly the cases
      // where the comparison is false.
      if (!(r2<cutsq[mtype]))
        continue;

      const numtyp r2inv=ucl_recip(r2);

      // ---- Lennard-Jones ----
      numtyp force_lj, r6inv;
      if (r2 < lj1[mtype].z) {
        r6inv = r2inv*r2inv*r2inv;
        force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
      } else
        force_lj = (numtyp)0.0;

      // ---- charge / dipole electrostatics ----
      // These scalars are reused by the energy tally further down, under the
      // same conditions that assign them here.
      numtyp rinv, r3inv, r5inv;
      numtyp pdotp, pidotr, pjdotr;
      acctyp4 fcoul, tcoul;
      fcoul.x = fcoul.y = fcoul.z = (acctyp)0;
      tcoul.x = tcoul.y = tcoul.z = (acctyp)0;

      if (r2 < lj1[mtype].w) {
        rinv = ucl_rsqrt(r2);

        // charge - charge
        if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          const numtyp qq = qtmp*qj*r3inv;

          fcoul.x += qq*dx;
          fcoul.y += qq*dy;
          fcoul.z += qq*dz;
        }

        // dipole - dipole
        if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          const numtyp r7inv = r5inv*r2inv;
          pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
          pidotr = mui.x*dx + mui.y*dy + mui.z*dz;
          pjdotr = muj.x*dx + muj.y*dy + muj.z*dz;

          const numtyp a1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr;
          const numtyp a2 = (numtyp)3.0*r5inv*pjdotr;
          const numtyp a3 = (numtyp)3.0*r5inv*pidotr;
          const numtyp a4 = (numtyp)(-1.0)*r3inv;

          fcoul.x += a1*dx + a2*mui.x + a3*muj.x;
          fcoul.y += a1*dy + a2*mui.y + a3*muj.y;
          fcoul.z += a1*dz + a2*mui.z + a3*muj.z;

          const numtyp crossx = a4 * (mui.y*muj.z - mui.z*muj.y);
          const numtyp crossy = a4 * (mui.z*muj.x - mui.x*muj.z);
          const numtyp crossz = a4 * (mui.x*muj.y - mui.y*muj.x);

          tcoul.x += crossx + a2 * (mui.y*dz - mui.z*dy);
          tcoul.y += crossy + a2 * (mui.z*dx - mui.x*dz);
          tcoul.z += crossz + a2 * (mui.x*dy - mui.y*dx);
        }

        // dipole on i - charge on j
        if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          pidotr = mui.x*dx + mui.y*dy + mui.z*dz;
          const numtyp b1 = (numtyp)3.0*qj*r5inv * pidotr;
          const numtyp b2 = qj*r3inv;

          fcoul.x += b2*mui.x - b1*dx;
          fcoul.y += b2*mui.y - b1*dy;
          fcoul.z += b2*mui.z - b1*dz;
          tcoul.x += b2 * (mui.y*dz - mui.z*dy);
          tcoul.y += b2 * (mui.z*dx - mui.x*dz);
          tcoul.z += b2 * (mui.x*dy - mui.y*dx);
        }

        // charge on i - dipole on j (adds force only, no torque on i)
        if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
          r3inv = r2inv*rinv;
          r5inv = r3inv*r2inv;
          pjdotr = muj.x*dx + muj.y*dy + muj.z*dz;
          const numtyp c1 = (numtyp)3.0*qtmp*r5inv * pjdotr;
          const numtyp c2 = qtmp*r3inv;

          fcoul.x += c1*dx - c2*muj.x;
          fcoul.y += c1*dy - c2*muj.y;
          fcoul.z += c1*dz - c2*muj.z;
        }
      }

      // ---- combine and accumulate ----
      const numtyp fq = factor_coul*qqrd2e;
      acctyp4 force;
      force.x = fq*fcoul.x + dx*force_lj;
      force.y = fq*fcoul.y + dy*force_lj;
      force.z = fq*fcoul.z + dz*force_lj;

      f.x+=force.x;
      f.y+=force.y;
      f.z+=force.z;
      tor.x+=fq*tcoul.x;
      tor.y+=fq*tcoul.y;
      tor.z+=fq*tcoul.z;

      if (eflag>0) {
        acctyp e = (acctyp)0;
        if (r2 < lj1[mtype].w) {
          e = qtmp*qj*rinv;
          if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
            e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr;
          if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
            e += -qj*r3inv*pidotr;
          if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
            e += qtmp*r3inv*pjdotr;
          e *= fq;
        }
        e_coul += e;

        if (r2 < lj1[mtype].z) {
          e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
      }

      if (vflag>0) {
        virial[0] += dx*force.x;
        virial[1] += dy*force.y;
        virial[2] += dz*force.z;
        virial[3] += dx*force.y;
        virial[4] += dx*force.z;
        virial[5] += dy*force.z;
      }
    } // for nbor

    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,
                     eflag,vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the dipole/cut pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_DIPOLE_LJ_H
|
||||||
|
#define LAL_DIPOLE_LJ_H
|
||||||
|
|
||||||
|
#include "lal_base_dipole.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
/// Accelerator class for the dipole/cut pair style.
/** Derives from BaseDipole and adds the per-type coefficient tables
  * (lj1, lj3, cutsq, sp_lj) declared in the TYPE DATA section below.
  * numtyp is the precision of the coefficient tables; acctyp is forwarded
  * to the base class. **/
template <class numtyp, class acctyp>
|
||||||
|
class DipoleLJ : public BaseDipole<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
DipoleLJ();
|
||||||
|
~DipoleLJ();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successful
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||||
|
double **host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
|
||||||
|
UCL_D_Vec<numtyp4> lj1;
|
||||||
|
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
|
||||||
|
UCL_D_Vec<numtyp4> lj3;
|
||||||
|
/// cutsq
|
||||||
|
UCL_D_Vec<numtyp> cutsq;
|
||||||
|
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
/// NOTE(review): presumably the qqrd2e value given to init() - confirm in the implementation file
|
||||||
|
numtyp _qqrd2e;
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// NOTE(review): presumably set by init() and reset by clear() - confirm in the implementation file
|
||||||
|
bool _allocated;
|
||||||
|
/// NOTE(review): expected to launch the pair kernel for one step; the body is not part of this header
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,128 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to dipole/cut acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_dipole_lj.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static DipoleLJ<PRECISION,ACC_PRECISION> DPLMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **offset, double *special_lj, const int inum,
|
||||||
|
const int nall, const int max_nbors, const int maxspecial,
|
||||||
|
const double cell_size, int &gpu_mode, FILE *screen,
|
||||||
|
double **host_cut_ljsq, double **host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e) {
|
||||||
|
DPLMF.clear();
|
||||||
|
gpu_mode=DPLMF.device->gpu_mode();
|
||||||
|
double gpu_split=DPLMF.device->particle_split();
|
||||||
|
int first_gpu=DPLMF.device->first_device();
|
||||||
|
int last_gpu=DPLMF.device->last_device();
|
||||||
|
int world_me=DPLMF.device->world_me();
|
||||||
|
int gpu_rank=DPLMF.device->gpu_rank();
|
||||||
|
int procs_per_gpu=DPLMF.device->procs_per_gpu();
|
||||||
|
|
||||||
|
DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu);
|
||||||
|
|
||||||
|
bool message=false;
|
||||||
|
if (DPLMF.device->replica_me()==0 && screen)
|
||||||
|
message=true;
|
||||||
|
|
||||||
|
if (message) {
|
||||||
|
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
|
||||||
|
int init_ok=0;
|
||||||
|
if (world_me==0)
|
||||||
|
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
||||||
|
host_lj4, offset, special_lj, inum, nall, 300,
|
||||||
|
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||||
|
host_cut_coulsq, host_special_coul, qqrd2e);
|
||||||
|
|
||||||
|
DPLMF.device->world_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
|
if (message) {
|
||||||
|
if (last_gpu-first_gpu==0)
|
||||||
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
|
else
|
||||||
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
|
last_gpu,i);
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
if (gpu_rank==i && world_me!=0)
|
||||||
|
init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
|
||||||
|
offset, special_lj, inum, nall, 300, maxspecial,
|
||||||
|
cell_size, gpu_split, screen, host_cut_ljsq,
|
||||||
|
host_cut_coulsq, host_special_coul, qqrd2e);
|
||||||
|
|
||||||
|
DPLMF.device->gpu_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
}
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"\n");
|
||||||
|
|
||||||
|
if (init_ok==0)
|
||||||
|
DPLMF.estimate_gpu_overhead();
|
||||||
|
return init_ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the data held by the file-level dipole/cut accelerator object
// by calling its clear() method.
void dpl_gpu_clear() {
|
||||||
|
DPLMF.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards every argument unchanged to the DPLMF.compute() overload that
// takes sublo/subhi, tag, nspecial and special, and returns the int**
// produced by that call.  host_start, ilist, jnum and success are passed
// on as given; this wrapper does not touch them itself.
int** dpl_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double **host_mu,
|
||||||
|
double *boxlo, double *prd) {
|
||||||
|
return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, host_mu, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards every argument unchanged to the DPLMF.compute() overload that
// takes caller-supplied ilist, numj and firstneigh arrays.  Nothing is
// returned by this wrapper; host_start and success are passed on as given.
void dpl_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||||
|
double **host_x, int *host_type, int *ilist, int *numj,
|
||||||
|
int **firstneigh, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
const double cpu_time, bool &success, double *host_q,
|
||||||
|
double **host_mu, const int nlocal, double *boxlo, double *prd) {
|
||||||
|
DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
|
||||||
|
vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
|
||||||
|
nlocal,boxlo,prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns DPLMF.host_memory_usage() for the file-level dipole/cut
// accelerator object.
double dpl_gpu_bytes() {
|
||||||
|
return DPLMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,170 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj_sf.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the dipole/sf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "dipole_lj_sf_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *dipole_lj_sf=0;
|
||||||
|
#else
|
||||||
|
#include "dipole_lj_sf_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_dipole_lj_sf.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define DipoleLJSFT DipoleLJSF<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
// Default constructor: constructs the BaseDipole base and marks the object
// as holding no allocated data (_allocated=false), so clear() is a no-op
// until init() has succeeded.
template <class numtyp, class acctyp>
|
||||||
|
DipoleLJSFT::DipoleLJSF() : BaseDipole<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
// Destructor: releases everything through clear(), which returns
// immediately when nothing is allocated.
template <class numtyp, class acctyp>
|
||||||
|
DipoleLJSFT::~DipoleLJSF() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-atom memory figure for this style: delegates to the inherited
// bytes_per_atom_atomic(max_nbors) and returns its value unchanged.
template <class numtyp, class acctyp>
|
||||||
|
int DipoleLJSFT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int DipoleLJSFT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3,
|
||||||
|
double **host_lj4,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, double **host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,dipole_lj_sf,"k_dipole_lj_sf");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||||
|
host_cut_ljsq, host_cut_coulsq);
|
||||||
|
|
||||||
|
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||||
|
host_cutsq);
|
||||||
|
|
||||||
|
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
|
||||||
|
sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void DipoleLJSFT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
lj1.clear();
|
||||||
|
lj3.clear();
|
||||||
|
cutsq.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Host memory figure for this style: the inherited
// host_memory_usage_atomic() value plus sizeof this class.
template <class numtyp, class acctyp>
|
||||||
|
double DipoleLJSFT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(DipoleLJSF<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&this->atom->quat, &cutsq,
|
||||||
|
&_qqrd2e, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &lj1, &lj3,
|
||||||
|
&_lj_types, &sp_lj, &this->nbor->dev_nbor,
|
||||||
|
&this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&this->atom->quat, &cutsq,
|
||||||
|
&_qqrd2e, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class DipoleLJSF<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,562 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// dipole_lj_sf.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the dipole/sf pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
|
||||||
|
// store_answers_tq: combine the partial results of the t_per_atom threads
// that work on the same atom and write the totals out.
//
//   f, tor       force / torque accumulators (x,y,z components are used)
//   energy       accumulated LJ energy
//   ecoul        accumulated Coulomb/dipole energy
//   virial       array of 6 virial accumulators
//   ii, inum     atom index and stride between consecutive output fields
//   tid, offset  thread index into red_acc and rank within the atom's group
//   ans, engv    output arrays
//
// Output layout (written only by the thread with offset==0):
//   engv[ii], engv[ii+inum]        energy, ecoul          (if eflag>0)
//   next 6 slots, stride inum      virial[0..5]           (if vflag>0)
//   ans[ii], ans[ii+inum]          force, torque
//
// The store path uses the macro parameter `ecoul`; it previously named the
// caller's variable `e_coul` directly, which only compiled when the caller
// happened to use that exact identifier.
//
// NOTE(review): the reduction loops contain no barrier between steps, so they
// presumably rely on the threads of one atom executing in lockstep - confirm
// for the targeted hardware.
#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,       \
                         t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                        \
    __local acctyp red_acc[8][BLOCK_PAIR];                                   \
    red_acc[0][tid]=f.x;                                                     \
    red_acc[1][tid]=f.y;                                                     \
    red_acc[2][tid]=f.z;                                                     \
    red_acc[3][tid]=tor.x;                                                   \
    red_acc[4][tid]=tor.y;                                                   \
    red_acc[5][tid]=tor.z;                                                   \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                          \
      if (offset < s) {                                                      \
        for (int r=0; r<6; r++)                                              \
          red_acc[r][tid] += red_acc[r][tid+s];                              \
      }                                                                      \
    }                                                                        \
    f.x=red_acc[0][tid];                                                     \
    f.y=red_acc[1][tid];                                                     \
    f.z=red_acc[2][tid];                                                     \
    tor.x=red_acc[3][tid];                                                   \
    tor.y=red_acc[4][tid];                                                   \
    tor.z=red_acc[5][tid];                                                   \
    if (eflag>0 || vflag>0) {                                                \
      for (int r=0; r<6; r++)                                                \
        red_acc[r][tid]=virial[r];                                           \
      red_acc[6][tid]=energy;                                                \
      red_acc[7][tid]=ecoul;                                                 \
      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                        \
        if (offset < s) {                                                    \
          for (int r=0; r<8; r++)                                            \
            red_acc[r][tid] += red_acc[r][tid+s];                            \
        }                                                                    \
      }                                                                      \
      for (int r=0; r<6; r++)                                                \
        virial[r]=red_acc[r][tid];                                           \
      energy=red_acc[6][tid];                                                \
      ecoul=red_acc[7][tid];                                                 \
    }                                                                        \
  }                                                                          \
  if (offset==0) {                                                           \
    engv+=ii;                                                                \
    if (eflag>0) {                                                           \
      *engv=energy;                                                          \
      engv+=inum;                                                            \
      *engv=ecoul;                                                           \
      engv+=inum;                                                            \
    }                                                                        \
    if (vflag>0) {                                                           \
      for (int i=0; i<6; i++) {                                              \
        *engv=virial[i];                                                     \
        engv+=inum;                                                          \
      }                                                                      \
    }                                                                        \
    ans[ii]=f;                                                               \
    ans[ii+inum]=tor;                                                        \
  }
|
||||||
|
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
texture<float4> mu_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
texture<int4,1> mu_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#define mu_tex mu_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Pair kernel for dipole/sf: shifted-force Lennard-Jones plus shifted-force
// charge-charge, dipole-dipole, dipole-charge and charge-dipole terms.
// General version: per-type-pair coefficients are read from global memory
// and indexed as itype*lj_types+jtype.
//
//   x_          positions; the .w component is read as the atom type
//   lj1         .x/.y LJ force coefficients, .z LJ cutoff^2, .w Coulomb cutoff^2
//   lj3         .x/.y LJ energy coefficients
//   lj_types    row length of the coefficient tables
//   sp_lj_in    8 special-bond factors: [0-3] LJ, [4-7] Coulomb
//   dev_nbor, dev_packed, nbor_pitch   neighbor-list data passed to nbor_info
//   ans, engv   outputs, written through store_answers_tq
//   eflag/vflag >0 to accumulate energies / virials
//   inum        number of atoms handled by this launch
//   q_, mu_     charges and dipoles; mu.w > 0 is the test for an active dipole
//   cutsq       overall cutoff^2 per type pair
//   qqrd2e      Coulomb prefactor
//   t_per_atom  number of threads cooperating on one atom
//
// The LJ force applies the special-bond factor factor_lj exactly once, to
// (cut force - shift force), matching how the LJ energy is scaled. Earlier
// code also multiplied the cut term by factor_lj, so that term was scaled by
// factor_lj^2 for special pairs.
__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
                             __global numtyp4* lj3, const int lj_types,
                             __global numtyp *sp_lj_in, __global int *dev_nbor,
                             __global int *dev_packed, __global acctyp4 *ans,
                             __global acctyp *engv, const int eflag,
                             const int vflag, const int inum,
                             const int nbor_pitch, __global numtyp *q_ ,
                             __global numtyp4 *mu_,
                             __global numtyp *cutsq, const numtyp qqrd2e,
                             const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Every thread copies all 8 special-bond factors itself before reading them
  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];
  sp_lj[4]=sp_lj_in[4];
  sp_lj[5]=sp_lj_in[5];
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp4 tor;
  tor.x=(acctyp)0;
  tor.y=(acctyp)0;
  tor.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond class is encoded in the neighbor index; read it
      // before masking the index down to the plain atom id.
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      numtyp qj; fetch(qj,j,q_tex);
      numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<cutsq[mtype]) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp force_lj, r6inv;
        numtyp rinv, r3inv, r5inv;
        numtyp pre1, pre2, pre4;
        numtyp pdotp, pidotr, pjdotr;
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

        acctyp4 forcecoul, ticoul;
        acctyp4 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;

        // Shifted-force Lennard-Jones
        if (rsq < lj1[mtype].z) {
          r6inv = r2inv*r2inv*r2inv;
          numtyp forceljcut = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;

          rcutlj2inv = ucl_recip(lj1[mtype].z);
          rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
          numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;

          // factor_lj is applied here only (once) to the shifted force
          force_lj = factor_lj * (forceljcut - forceljsf);
        } else force_lj = (numtyp)0.0;

        if (rsq < lj1[mtype].w) {
          rinv = ucl_rsqrt(rsq);
          // Computed once and reused by every electrostatic term below and
          // by the energy tally.
          rcutcoul2inv = ucl_recip(lj1[mtype].w);

          // charge-charge
          if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);

            forcecoul.x += pre1*delx;
            forcecoul.y += pre1*dely;
            forcecoul.z += pre1*delz;
          }

          // dipole-dipole
          if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;

            pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
            pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
            pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;

            afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
            pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
            aforcecoul.x = pre1*delx;
            aforcecoul.y = pre1*dely;
            aforcecoul.z = pre1*delz;

            bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
                   (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
            presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
            bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
            bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
            bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);

            forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
            forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
            forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);

            pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
            pre4 = -bfac*r3inv;

            numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
            numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
            numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);

            ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
            ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
            ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
          }

          // dipole-charge
          if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;
            pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
            pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
            pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
                    (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
            pre2 = qj*r3inv * pqfac;

            forcecoul.x += pre2*mui.x - pre1*delx;
            forcecoul.y += pre2*mui.y - pre1*dely;
            forcecoul.z += pre2*mui.z - pre1*delz;
            ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
            ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
            ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
          }

          // charge-dipole (no torque contribution on atom i from this term)
          if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;
            pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
            pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
            qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
                    (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
            pre2 = qtmp*r3inv * qpfac;

            forcecoul.x += pre1*delx - pre2*muj.x;
            forcecoul.y += pre1*dely - pre2*muj.y;
            forcecoul.z += pre1*delz - pre2*muj.z;
          }
        } else {
          forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
          ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
        }

        numtyp fq = factor_coul*qqrd2e;
        force.x = fq*forcecoul.x + delx*force_lj;
        force.y = fq*forcecoul.y + dely*force_lj;
        force.z = fq*forcecoul.z + delz*force_lj;
        f.x+=force.x;
        f.y+=force.y;
        f.z+=force.z;
        tor.x+=fq*ticoul.x;
        tor.y+=fq*ticoul.y;
        tor.z+=fq*ticoul.z;

        if (eflag>0) {
          acctyp e = (acctyp)0.0;
          if (rsq < lj1[mtype].w) {
            // Each term below reuses intermediates that were set in the force
            // branch guarded by the same condition.
            numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
            e = qtmp*qj*rinv*fac*fac;
            if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
              e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
            if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
              e += -qj*r3inv*pidotr * pqfac;
            if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
              e += qtmp*r3inv*pjdotr * qpfac;
            e *= fq;
          } else e = (acctyp)0.0;
          e_coul += e;

          if (rsq < lj1[mtype].z) {
            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
              rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
              (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
              rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
              (numtyp)4.0*lj3[mtype].y);
            energy+=factor_lj*e;
          }
        }
        if (vflag>0) {
          virial[0] += delx*force.x;
          virial[1] += dely*force.y;
          virial[2] += delz*force.z;
          virial[3] += delx*force.y;
          virial[4] += delx*force.z;
          virial[5] += dely*force.z;
        }
      }
    } // for nbor
    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// Pair kernel for dipole/sf, fast version: the per-type-pair tables (lj1,
// lj3, cutsq) and the special-bond factors are first staged into __local
// arrays of MAX_SHARED_TYPES*MAX_SHARED_TYPES entries, then the same
// shifted-force LJ + charge/dipole interactions as k_dipole_lj_sf are
// evaluated. Type pairs are indexed as MAX_SHARED_TYPES*itype+jtype.
//
//   x_          positions; the .w component is read as the atom type
//   lj1_in      .x/.y LJ force coefficients, .z LJ cutoff^2, .w Coulomb cutoff^2
//   lj3_in      .x/.y LJ energy coefficients (staged only when eflag>0)
//   sp_lj_in    8 special-bond factors: [0-3] LJ, [4-7] Coulomb
//   dev_nbor, dev_packed, nbor_pitch   neighbor-list data passed to nbor_info
//   ans, engv   outputs, written through store_answers_tq
//   eflag/vflag >0 to accumulate energies / virials
//   inum        number of atoms handled by this launch
//   q_, mu_     charges and dipoles; mu.w > 0 is the test for an active dipole
//   _cutsq      overall cutoff^2 per type pair
//   qqrd2e      Coulomb prefactor
//   t_per_atom  number of threads cooperating on one atom
//
// The LJ force applies the special-bond factor factor_lj exactly once, to
// (cut force - shift force), matching how the LJ energy is scaled. Earlier
// code also multiplied the cut term by factor_lj, so that term was scaled by
// factor_lj^2 for special pairs.
__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                  __global numtyp4* lj3_in,
                                  __global numtyp* sp_lj_in,
                                  __global int *dev_nbor, __global int *dev_packed,
                                  __global acctyp4 *ans, __global acctyp *engv,
                                  const int eflag, const int vflag, const int inum,
                                  const int nbor_pitch, __global numtyp *q_,
                                  __global numtyp4 *mu_,
                                  __global numtyp *_cutsq, const numtyp qqrd2e,
                                  const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage the coefficient tables; thread tid copies entry tid.
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    cutsq[tid]=_cutsq[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp4 tor;
  tor.x=(acctyp)0;
  tor.y=(acctyp)0;
  tor.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // Make the staged tables visible to all threads before they are read
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
    int iw=ix.w;
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond class is encoded in the neighbor index; read it
      // before masking the index down to the plain atom id.
      numtyp factor_lj, factor_coul;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      numtyp qj; fetch(qj,j,q_tex);
      numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<cutsq[mtype]) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp force_lj, r6inv;
        numtyp rinv, r3inv, r5inv;
        numtyp pre1, pre2, pre4;
        numtyp pdotp, pidotr, pjdotr;
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

        acctyp4 forcecoul, ticoul;
        acctyp4 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;

        // Shifted-force Lennard-Jones
        if (rsq < lj1[mtype].z) {
          r6inv = r2inv*r2inv*r2inv;
          numtyp forceljcut = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;

          rcutlj2inv = ucl_recip(lj1[mtype].z);
          rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
          numtyp forceljsf = rcutlj6inv*(lj1[mtype].x*rcutlj6inv-lj1[mtype].y)*rcutlj2inv;

          // factor_lj is applied here only (once) to the shifted force
          force_lj = factor_lj * (forceljcut - forceljsf);
        } else force_lj = (numtyp)0.0;

        if (rsq < lj1[mtype].w) {
          rinv = ucl_rsqrt(rsq);
          rcutcoul2inv = ucl_recip(lj1[mtype].w);

          // charge-charge
          if (qtmp != (numtyp)0.0 && qj != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            pre1 = qtmp*qj*rinv*(r2inv-rcutcoul2inv);

            forcecoul.x += pre1*delx;
            forcecoul.y += pre1*dely;
            forcecoul.z += pre1*delz;
          }

          // dipole-dipole
          if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;

            pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
            pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
            pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;

            afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
            pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr);
            aforcecoul.x = pre1*delx;
            aforcecoul.y = pre1*dely;
            aforcecoul.z = pre1*delz;

            bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+
                   (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
            presf = (numtyp)2.0*r2inv*pidotr*pjdotr;
            bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx);
            bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely);
            bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz);

            forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x);
            forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y);
            forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z);

            pre2 = (numtyp)3.0*bfac*r5inv*pjdotr;
            pre4 = -bfac*r3inv;

            numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y);
            numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z);
            numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x);

            ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely);
            ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz);
            ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx);
          }

          // dipole-charge
          if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;
            pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
            pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv);
            pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
                    (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
            pre2 = qj*r3inv * pqfac;

            forcecoul.x += pre2*mui.x - pre1*delx;
            forcecoul.y += pre2*mui.y - pre1*dely;
            forcecoul.z += pre2*mui.z - pre1*delz;
            ticoul.x += pre2 * (mui.y*delz - mui.z*dely);
            ticoul.y += pre2 * (mui.z*delx - mui.x*delz);
            ticoul.z += pre2 * (mui.x*dely - mui.y*delx);
          }

          // charge-dipole (no torque contribution on atom i from this term)
          if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;
            pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;

            pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv);
            qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv +
                    (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv);
            pre2 = qtmp*r3inv * qpfac;

            forcecoul.x += pre1*delx - pre2*muj.x;
            forcecoul.y += pre1*dely - pre2*muj.y;
            forcecoul.z += pre1*delz - pre2*muj.z;
          }
        } else {
          forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
          ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
        }

        numtyp fq = factor_coul*qqrd2e;
        force.x = fq*forcecoul.x + delx*force_lj;
        force.y = fq*forcecoul.y + dely*force_lj;
        force.z = fq*forcecoul.z + delz*force_lj;
        f.x+=force.x;
        f.y+=force.y;
        f.z+=force.z;
        tor.x+=fq*ticoul.x;
        tor.y+=fq*ticoul.y;
        tor.z+=fq*ticoul.z;

        if (eflag>0) {
          acctyp e = (acctyp)0.0;
          if (rsq < lj1[mtype].w) {
            // Each term below reuses intermediates that were set in the force
            // branch guarded by the same condition.
            numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv);
            e = qtmp*qj*rinv*fac*fac;
            if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0)
              e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr);
            if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0)
              e += -qj*r3inv*pidotr * pqfac;
            if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0)
              e += qtmp*r3inv*pjdotr * qpfac;
            e *= fq;
          } else e = (acctyp)0.0;
          e_coul += e;

          if (rsq < lj1[mtype].z) {
            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) +
              rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv -
              (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv +
              rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv +
              (numtyp)4.0*lj3[mtype].y);
            energy+=factor_lj*e;
          }
        }
        if (vflag>0) {
          virial[0] += delx*force.x;
          virial[1] += dely*force.y;
          virial[2] += delz*force.z;
          virial[3] += delx*force.y;
          virial[4] += delx*force.z;
          virial[5] += dely*force.z;
        }
      }

    } // for nbor
    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                     vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj_sf.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the dipole/sf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_DIPOLE_LJ_SF_H
|
||||||
|
#define LAL_DIPOLE_LJ_SF_H
|
||||||
|
|
||||||
|
#include "lal_base_dipole.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class DipoleLJSF : public BaseDipole<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
DipoleLJSF();
|
||||||
|
~DipoleLJSF();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||||
|
double **host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
|
||||||
|
UCL_D_Vec<numtyp4> lj1;
|
||||||
|
/// lj3.x = lj3, lj3.y = lj4, lj3.z = cutsq
|
||||||
|
UCL_D_Vec<numtyp4> lj3;
|
||||||
|
/// cutsq
|
||||||
|
UCL_D_Vec<numtyp> cutsq;
|
||||||
|
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
numtyp _qqrd2e;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,128 @@
|
||||||
|
/***************************************************************************
|
||||||
|
dipole_lj_sf_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to dipole/sf acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_dipole_lj_sf.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static DipoleLJSF<PRECISION,ACC_PRECISION> DPLSFMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Initialize the dipole/sf accelerator object on every process sharing a
// device. Process 0 initializes first (and compiles); the remaining
// processes then initialize one GPU rank at a time, separated by barriers.
// gpu_mode is an output and receives the device's mode. Returns the status
// of this process's init() call (0 when this process did not need one or it
// succeeded).
// Note: max_nbors is not forwarded; both init() calls pass the literal 300.
int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   double **host_cut_ljsq, double **host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e) {
  DPLSFMF.clear();
  gpu_mode=DPLSFMF.device->gpu_mode();
  const double split=DPLSFMF.device->particle_split();
  const int gpu_first=DPLSFMF.device->first_device();
  const int gpu_last=DPLSFMF.device->last_device();
  const int my_world_rank=DPLSFMF.device->world_me();
  const int my_gpu_rank=DPLSFMF.device->gpu_rank();
  const int ranks_per_gpu=DPLSFMF.device->procs_per_gpu();

  DPLSFMF.device->init_message(screen,"dipole/sf",gpu_first,gpu_last);

  // Only replica 0 reports progress, and only when a screen is available
  const bool verbose=(DPLSFMF.device->replica_me()==0 && screen);

  if (verbose) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int status=0;
  if (my_world_rank==0) {
    status=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                        host_lj4, special_lj, inum, nall, 300,
                        maxspecial, cell_size, split, screen, host_cut_ljsq,
                        host_cut_coulsq, host_special_coul, qqrd2e);
  }

  DPLSFMF.device->world_barrier();
  if (verbose) {
    fprintf(screen,"Done.\n");
  }

  for (int core=0; core<ranks_per_gpu; core++) {
    if (verbose) {
      if (gpu_last==gpu_first) {
        fprintf(screen,"Initializing GPU %d on core %d...",gpu_first,core);
      } else {
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",gpu_first,
                gpu_last,core);
      }
      fflush(screen);
    }
    // World rank 0 was initialized above; everyone else takes its turn here
    if (my_world_rank!=0 && my_gpu_rank==core) {
      status=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                          special_lj, inum, nall, 300, maxspecial,
                          cell_size, split, screen, host_cut_ljsq,
                          host_cut_coulsq, host_special_coul, qqrd2e);
    }

    DPLSFMF.device->gpu_barrier();
    if (verbose) {
      fprintf(screen,"Done.\n");
    }
  }
  if (verbose) {
    fprintf(screen,"\n");
  }

  if (status==0) {
    DPLSFMF.estimate_gpu_overhead();
  }
  return status;
}
|
||||||
|
|
||||||
|
// Release the data held by the file-level dipole/sf accelerator object.
void dplsf_gpu_clear() {
  DPLSFMF.clear();
}
|
||||||
|
|
||||||
|
int** dplsf_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double **host_mu,
|
||||||
|
double *boxlo, double *prd) {
|
||||||
|
return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, host_mu, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward all arguments unchanged to the overload of DPLSFMF.compute() that
// takes a caller-supplied neighbor list (ilist / numj / firstneigh).
void dplsf_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_q,
                       double **host_mu, const int nlocal, double *boxlo, double *prd) {
  DPLSFMF.compute(ago, inum_full, nall, host_x, host_type,
                  ilist, numj, firstneigh,
                  eflag, vflag, eatom, vatom,
                  host_start, cpu_time, success,
                  host_q, host_mu, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double dplsf_gpu_bytes() {
|
||||||
|
return DPLSFMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,146 @@
|
||||||
|
/***************************************************************************
|
||||||
|
gauss.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the gauss pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "gauss_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *gauss=0;
|
||||||
|
#else
|
||||||
|
#include "gauss_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_gauss.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define GaussT Gauss<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
GaussT::Gauss() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
GaussT::~Gauss() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int GaussT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int GaussT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_a,
|
||||||
|
double **host_b, double **host_offset,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,gauss,"k_gauss");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
|
||||||
|
host_cutsq,host_offset);
|
||||||
|
|
||||||
|
UCL_H_Vec<double> dview;
|
||||||
|
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||||
|
ucl_copy(sp_lj,dview,false);
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=gauss1.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void GaussT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
gauss1.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double GaussT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(Gauss<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void GaussT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class Gauss<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,189 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// gauss.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the gauss pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Gauss pair kernel, general version: the per-type-pair coefficients are read
// from the gauss1 table in global memory at index itype*lj_types+jtype.
//   gauss1[].x = a, .y = b, .z = cutsq, .w = offset
//   sp_lj_in   = four special-bond scaling factors, selected with sbmask(j)
// For each atom ii<inum the force (and, when eflag>0 / vflag>0, the energy
// and virial) is summed over its neighbor list and passed to store_answers().
// ---------------------------------------------------------------------------
__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
                      const int lj_types,
                      __global numtyp *sp_lj_in, __global int *dev_nbor,
                      __global int *dev_packed, __global acctyp4 *ans,
                      __global acctyp *engv, const int eflag,
                      const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Local copy of the special-bond factors
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;  // the atom type is stored in the w component

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The special-bond code must be extracted before the index is masked
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<gauss1[mtype].z) {
        // Only 1/r^2 is needed here; no square root is taken
        numtyp r2inv = ucl_recip(rsq);
        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
          ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
                     gauss1[mtype].w);
          energy+=factor_lj*e;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// Gauss pair kernel, fast version: the coefficient table has the fixed
// dimension MAX_SHARED_TYPES x MAX_SHARED_TYPES and is staged into local
// memory by the first threads of the block before it is used.
//   gauss1[].x = a, .y = b, .z = cutsq, .w = offset
//   sp_lj_in   = four special-bond scaling factors, selected with sbmask(j)
// For each atom ii<inum the force (and, when eflag>0 / vflag>0, the energy
// and virial) is summed over its neighbor list and passed to store_answers().
// ---------------------------------------------------------------------------
__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
                           __global numtyp* sp_lj_in,
                           __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Each thread with a small enough tid copies one table entry
  __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    gauss1[tid]=gauss1_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The local tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int iw=ix.w;  // the atom type is stored in the w component
    // Row offset of type iw in the fixed-size table
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The special-bond code must be extracted before the index is masked
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<gauss1[mtype].z) {
        // Only 1/r^2 is needed here; no square root is taken
        numtyp r2inv = ucl_recip(rsq);
        numtyp force = (numtyp)-2.0*gauss1[mtype].x*gauss1[mtype].y*rsq*
          ucl_exp(-gauss1[mtype].y*rsq)*r2inv*factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) -
                     gauss1[mtype].w);
          energy+=factor_lj*e;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
/***************************************************************************
|
||||||
|
gauss.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the gauss pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_GAUSS_H
#define LAL_GAUSS_H

#include "lal_base_atomic.h"

namespace LAMMPS_AL {

/// Accelerated gauss pair style built on top of BaseAtomic
template <class numtyp, class acctyp>
class Gauss : public BaseAtomic<numtyp, acctyp> {
 public:
  Gauss();
  ~Gauss();

  /// Allocate the coefficient tables and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, double **host_cutsq,
           double **host_a, double **host_b, double **host_offset,
           double *host_special_lj,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
           const double gpu_split, FILE *screen);

  /// Clear all host and device data
  /** \note gauss_gpu_init() calls this before init(); it does nothing when
    *       no data has been allocated **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset
  UCL_D_Vec<numtyp4> gauss1;
  /// Special LJ values
  UCL_D_Vec<numtyp> sp_lj;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Row length of the gauss1 table (padded when the fast kernel is used)
  int _lj_types;

 private:
  /// True between a successful init() and the next clear()
  bool _allocated;
  /// Launch the pair kernel for the current step
  void loop(const bool _eflag, const bool _vflag);
};

}

#endif
|
|
@ -0,0 +1,120 @@
|
||||||
|
/***************************************************************************
|
||||||
|
gauss_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to gauss acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_gauss.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static Gauss<PRECISION,ACC_PRECISION> GLMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a,
|
||||||
|
double **host_b, double **offset, double *special_lj,
|
||||||
|
const int inum, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial,
|
||||||
|
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||||
|
GLMF.clear();
|
||||||
|
gpu_mode=GLMF.device->gpu_mode();
|
||||||
|
double gpu_split=GLMF.device->particle_split();
|
||||||
|
int first_gpu=GLMF.device->first_device();
|
||||||
|
int last_gpu=GLMF.device->last_device();
|
||||||
|
int world_me=GLMF.device->world_me();
|
||||||
|
int gpu_rank=GLMF.device->gpu_rank();
|
||||||
|
int procs_per_gpu=GLMF.device->procs_per_gpu();
|
||||||
|
|
||||||
|
GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu);
|
||||||
|
|
||||||
|
bool message=false;
|
||||||
|
if (GLMF.device->replica_me()==0 && screen)
|
||||||
|
message=true;
|
||||||
|
|
||||||
|
if (message) {
|
||||||
|
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
|
||||||
|
int init_ok=0;
|
||||||
|
if (world_me==0)
|
||||||
|
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
|
||||||
|
offset, special_lj, inum, nall, 300,
|
||||||
|
maxspecial, cell_size, gpu_split, screen);
|
||||||
|
|
||||||
|
GLMF.device->world_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
|
if (message) {
|
||||||
|
if (last_gpu-first_gpu==0)
|
||||||
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
|
else
|
||||||
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
|
last_gpu,i);
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
if (gpu_rank==i && world_me!=0)
|
||||||
|
init_ok=GLMF.init(ntypes, cutsq, host_a, host_b,
|
||||||
|
offset, special_lj, inum, nall, 300, maxspecial,
|
||||||
|
cell_size, gpu_split, screen);
|
||||||
|
|
||||||
|
GLMF.device->gpu_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
}
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"\n");
|
||||||
|
|
||||||
|
if (init_ok==0)
|
||||||
|
GLMF.estimate_gpu_overhead();
|
||||||
|
return init_ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release everything held by the file-level gauss accelerator object.
void gauss_gpu_clear() { GLMF.clear(); }
|
||||||
|
|
||||||
|
int ** gauss_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success) {
|
||||||
|
return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward one compute step to the accelerator using the overload that takes
// a caller-supplied neighbor list (ilist / numj / firstneigh).
void gauss_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success)
{
  GLMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
               firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
               success);
}
|
||||||
|
|
||||||
|
double gauss_gpu_bytes() {
|
||||||
|
return GLMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,168 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_coul_debye.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the lj/cut/coul/debye pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "lj_coul_debye_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *lj_coul_debye=0;
|
||||||
|
#else
|
||||||
|
#include "lj_coul_debye_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_lj_coul_debye.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define LJCoulDebyeT LJCoulDebye<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
LJCoulDebyeT::LJCoulDebye() : BaseCharge<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
LJCoulDebyeT::~LJCoulDebye() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int LJCoulDebyeT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3,
|
||||||
|
double **host_lj4, double **host_offset,
|
||||||
|
double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, double **host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e,
|
||||||
|
const double kappa) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,lj_coul_debye,"k_lj_debye");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||||
|
host_cut_ljsq, host_cut_coulsq);
|
||||||
|
|
||||||
|
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||||
|
host_offset);
|
||||||
|
|
||||||
|
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
_kappa=kappa;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
|
||||||
|
sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void LJCoulDebyeT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
lj1.clear();
|
||||||
|
lj3.clear();
|
||||||
|
cutsq.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double LJCoulDebyeT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(LJCoulDebye<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||||
|
&_qqrd2e, &_kappa, &this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->atom->q, &cutsq,
|
||||||
|
&_qqrd2e, &_kappa, &this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class LJCoulDebye<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,256 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// lj_coul_debye.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the lj/cut/coul/debye pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// LJ + Debye-screened Coulomb pair kernel, general version: coefficients are
// read from the global tables at index itype*lj_types+jtype.
//   lj1[].x/.y = LJ force coefficients, .z = LJ cutoff^2, .w = Coulomb cutoff^2
//   lj3[].x/.y = LJ energy coefficients, .z = energy offset
//   sp_lj_in   = 8 special-bond factors: [0..3] LJ, [4..7] Coulomb
// For each atom ii<inum the force (and, when eflag>0 / vflag>0, the LJ and
// Coulomb energies and the virial) is summed over its neighbor list and
// passed to store_answers_q().
//
// NOTE(review): the host code passes "k_lj_debye" as the kernel name while
// this kernel is named k_lj_debye_pair (the gauss pair passes "k_gauss" for
// kernels k_gauss / k_gauss_fast) - confirm the lookup resolves this name.
// ---------------------------------------------------------------------------
__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                              __global numtyp4* lj3, const int lj_types,
                              __global numtyp *sp_lj_in,
                              __global int *dev_nbor,
                              __global int *dev_packed, __global acctyp4 *ans,
                              __global acctyp *engv, const int eflag,
                              const int vflag, const int inum,
                              const int nbor_pitch, __global numtyp *q_ ,
                              __global numtyp *cutsq, const numtyp qqrd2e,
                              const numtyp kappa,
                              const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Local copy of the special-bond factors
  __local numtyp sp_lj[8];
  for (int k=0; k<8; k++)
    sp_lj[k]=sp_lj_in[k];

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;  // the atom type is stored in the w component

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond code must be extracted before the index is masked
      const int sb=sbmask(j);
      const numtyp factor_lj = sp_lj[sb];
      const numtyp factor_coul = sp_lj[sb+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Separation vector and squared distance
      numtyp dx = ix.x-jx.x;
      numtyp dy = ix.y-jx.y;
      numtyp dz = ix.z-jx.z;
      numtyp dist_sq = dx*dx+dy*dy+dz*dz;

      int pair=itype*lj_types+jtype;
      if (dist_sq<cutsq[pair]) {
        numtyp r2inv=ucl_recip(dist_sq);
        numtyp r6inv, rinv, screening;
        numtyp force_lj=(numtyp)0.0;
        numtyp forcecoul=(numtyp)0.0;

        // Each interaction has its own cutoff inside the overall one
        const int lj_on=(dist_sq < lj1[pair].z);
        const int coul_on=(dist_sq < lj1[pair].w);

        if (lj_on) {
          r6inv = r2inv*r2inv*r2inv;
          force_lj = factor_lj*r6inv*(lj1[pair].x*r6inv-lj1[pair].y);
        }

        if (coul_on) {
          numtyp r = ucl_sqrt(dist_sq);
          rinv = ucl_recip(r);
          // screening holds q_j * exp(-kappa*r)
          fetch(screening,j,q_tex);
          screening *= ucl_exp(-kappa*r);
          forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
        }

        numtyp force = (force_lj + forcecoul) * r2inv;

        f.x+=dx*force;
        f.y+=dy*force;
        f.z+=dz*force;

        if (eflag>0) {
          if (lj_on) {
            numtyp e=r6inv*(lj3[pair].x*r6inv-lj3[pair].y);
            energy+=factor_lj*(e-lj3[pair].z);
          }
          if (coul_on) {
            e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
          }
        }
        if (vflag>0) {
          virial[0] += dx*dx*force;
          virial[1] += dy*dy*force;
          virial[2] += dz*dz*force;
          virial[3] += dx*dy*force;
          virial[4] += dx*dz*force;
          virial[5] += dy*dz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
// LJ + Debye-screened Coulomb pair kernel, fast version: the coefficient
// tables have the fixed dimension MAX_SHARED_TYPES x MAX_SHARED_TYPES and
// are staged into local memory by the first threads of the block.
//   lj1[].x/.y = LJ force coefficients, .z = LJ cutoff^2, .w = Coulomb cutoff^2
//   lj3[].x/.y = LJ energy coefficients, .z = energy offset
//   sp_lj_in   = 8 special-bond factors: [0..3] LJ, [4..7] Coulomb
// For each atom ii<inum the force (and, when eflag>0 / vflag>0, the LJ and
// Coulomb energies and the virial) is summed over its neighbor list and
// passed to store_answers_q().
// ---------------------------------------------------------------------------
__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                              __global numtyp4* lj3_in,
                              __global numtyp* sp_lj_in,
                              __global int *dev_nbor, __global int *dev_packed,
                              __global acctyp4 *ans, __global acctyp *engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, __global numtyp *q_,
                              __global numtyp *_cutsq, const numtyp qqrd2e,
                              const numtyp kappa,
                              const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];

  // Each thread with a small enough tid copies one entry of every table;
  // the energy coefficients are only staged when energies are requested
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    cutsq[tid]=_cutsq[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The local tables must be complete before any thread reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;  // the atom type is stored in the w component
    // Row offset of type iw in the fixed-size tables
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // The special-bond code must be extracted before the index is masked
      const int sb=sbmask(j);
      const numtyp factor_lj = sp_lj[sb];
      const numtyp factor_coul = sp_lj[sb+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int pair=itype+jx.w;

      // Separation vector and squared distance
      numtyp dx = ix.x-jx.x;
      numtyp dy = ix.y-jx.y;
      numtyp dz = ix.z-jx.z;
      numtyp dist_sq = dx*dx+dy*dy+dz*dz;

      if (dist_sq<cutsq[pair]) {
        numtyp r2inv=ucl_recip(dist_sq);
        numtyp r6inv, rinv, screening;
        numtyp force_lj=(numtyp)0.0;
        numtyp forcecoul=(numtyp)0.0;

        // Each interaction has its own cutoff inside the overall one
        const int lj_on=(dist_sq < lj1[pair].z);
        const int coul_on=(dist_sq < lj1[pair].w);

        if (lj_on) {
          r6inv = r2inv*r2inv*r2inv;
          force_lj = factor_lj*r6inv*(lj1[pair].x*r6inv-lj1[pair].y);
        }

        if (coul_on) {
          numtyp r = ucl_sqrt(dist_sq);
          rinv = ucl_recip(r);
          // screening holds q_j * exp(-kappa*r)
          fetch(screening,j,q_tex);
          screening *= ucl_exp(-kappa*r);
          forcecoul = qqrd2e*qtmp*(kappa+rinv)*screening*factor_coul;
        }

        numtyp force = (force_lj + forcecoul) * r2inv;

        f.x+=dx*force;
        f.y+=dy*force;
        f.z+=dz*force;

        if (eflag>0) {
          if (lj_on) {
            numtyp e=r6inv*(lj3[pair].x*r6inv-lj3[pair].y);
            energy+=factor_lj*(e-lj3[pair].z);
          }
          if (coul_on) {
            e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul;
          }
        }
        if (vflag>0) {
          virial[0] += dx*dx*force;
          virial[1] += dy*dy*force;
          virial[2] += dz*dz*force;
          virial[3] += dx*dy*force;
          virial[4] += dx*dz*force;
          virial[5] += dy*dz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_coul_debye.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the lj/cut/coul/debye pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_LJ_COUL_DEBYE_H
|
||||||
|
#define LAL_LJ_COUL_DEBYE_H
|
||||||
|
|
||||||
|
#include "lal_base_charge.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class LJCoulDebye : public BaseCharge<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
LJCoulDebye();
|
||||||
|
~LJCoulDebye();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||||
|
double **host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e, const double kappa);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
|
||||||
|
UCL_D_Vec<numtyp4> lj1;
|
||||||
|
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
|
||||||
|
UCL_D_Vec<numtyp4> lj3;
|
||||||
|
/// cutsq
|
||||||
|
UCL_D_Vec<numtyp> cutsq;
|
||||||
|
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
numtyp _qqrd2e,_kappa;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,129 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_coul_debye_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to lj/cut/coul/debye acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_lj_coul_debye.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
// File-local instance that backs every ljcd_gpu_* wrapper function below
static LJCoulDebye<PRECISION,ACC_PRECISION> LJCDMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **offset, double *special_lj, const int inum,
|
||||||
|
const int nall, const int max_nbors, const int maxspecial,
|
||||||
|
const double cell_size, int &gpu_mode, FILE *screen,
|
||||||
|
double **host_cut_ljsq, double **host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e,
|
||||||
|
const double kappa) {
|
||||||
|
LJCDMF.clear();
|
||||||
|
gpu_mode=LJCDMF.device->gpu_mode();
|
||||||
|
double gpu_split=LJCDMF.device->particle_split();
|
||||||
|
int first_gpu=LJCDMF.device->first_device();
|
||||||
|
int last_gpu=LJCDMF.device->last_device();
|
||||||
|
int world_me=LJCDMF.device->world_me();
|
||||||
|
int gpu_rank=LJCDMF.device->gpu_rank();
|
||||||
|
int procs_per_gpu=LJCDMF.device->procs_per_gpu();
|
||||||
|
|
||||||
|
LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu);
|
||||||
|
|
||||||
|
bool message=false;
|
||||||
|
if (LJCDMF.device->replica_me()==0 && screen)
|
||||||
|
message=true;
|
||||||
|
|
||||||
|
if (message) {
|
||||||
|
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
|
||||||
|
int init_ok=0;
|
||||||
|
if (world_me==0)
|
||||||
|
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
||||||
|
host_lj4, offset, special_lj, inum, nall, 300,
|
||||||
|
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
|
||||||
|
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
|
||||||
|
|
||||||
|
LJCDMF.device->world_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
|
||||||
|
for (int i=0; i<procs_per_gpu; i++) {
|
||||||
|
if (message) {
|
||||||
|
if (last_gpu-first_gpu==0)
|
||||||
|
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||||
|
else
|
||||||
|
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||||
|
last_gpu,i);
|
||||||
|
fflush(screen);
|
||||||
|
}
|
||||||
|
if (gpu_rank==i && world_me!=0)
|
||||||
|
init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
|
||||||
|
offset, special_lj, inum, nall, 300, maxspecial,
|
||||||
|
cell_size, gpu_split, screen, host_cut_ljsq,
|
||||||
|
host_cut_coulsq, host_special_coul, qqrd2e, kappa);
|
||||||
|
|
||||||
|
LJCDMF.device->gpu_barrier();
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"Done.\n");
|
||||||
|
}
|
||||||
|
if (message)
|
||||||
|
fprintf(screen,"\n");
|
||||||
|
|
||||||
|
if (init_ok==0)
|
||||||
|
LJCDMF.estimate_gpu_overhead();
|
||||||
|
return init_ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the data held by the file-local lj/cut/coul/debye instance
void ljcd_gpu_clear()
{
  LJCDMF.clear();
}
|
||||||
|
|
||||||
|
int** ljcd_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double *boxlo,
|
||||||
|
double *prd) {
|
||||||
|
return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward to the LJCoulDebye compute() overload that takes caller-supplied
// neighbor data (ilist, numj, firstneigh).
void ljcd_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success, double *host_q,
                      const int nlocal, double *boxlo, double *prd)
{
  LJCDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
                 firstneigh, eflag, vflag, eatom, vatom, host_start,
                 cpu_time, success, host_q, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double ljcd_gpu_bytes() {
|
||||||
|
return LJCDMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,168 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_dsf.cpp
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the lj/cut/coul/dsf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 7/12/2012
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#if defined(USE_OPENCL)
|
||||||
|
#include "lj_dsf_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *lj_dsf=0;
|
||||||
|
#else
|
||||||
|
#include "lj_dsf_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_lj_dsf.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define LJDSFT LJDSF<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
LJDSFT::LJDSF() : BaseCharge<numtyp,acctyp>(),
|
||||||
|
_allocated(false) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
LJDSFT::~LJDSF() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int LJDSFT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen,
|
||||||
|
double **host_cut_ljsq, const double host_cut_coulsq,
|
||||||
|
double *host_special_coul, const double qqrd2e,
|
||||||
|
const double e_shift, const double f_shift,
|
||||||
|
const double alpha) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,lj_dsf,"k_lj_dsf");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
_cut_coulsq=host_cut_coulsq;
|
||||||
|
_e_shift=e_shift;
|
||||||
|
_f_shift=f_shift;
|
||||||
|
_alpha=alpha;
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types; i++)
|
||||||
|
host_write[i]=0.0;
|
||||||
|
|
||||||
|
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||||
|
host_cut_ljsq, host_cutsq);
|
||||||
|
|
||||||
|
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||||
|
host_offset);
|
||||||
|
|
||||||
|
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
host_write[i]=host_special_lj[i];
|
||||||
|
host_write[i+4]=host_special_coul[i];
|
||||||
|
}
|
||||||
|
ucl_copy(sp_lj,host_write,8,false);
|
||||||
|
|
||||||
|
_qqrd2e=qqrd2e;
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void LJDSFT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
lj1.clear();
|
||||||
|
lj3.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double LJDSFT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(LJDSF<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate energies, forces, and torques
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void LJDSFT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag,
|
||||||
|
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv,
|
||||||
|
&eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||||
|
&_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha,
|
||||||
|
&this->_threads_per_atom);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Explicit instantiation for the precision pair selected at build time
template class LJDSF<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,261 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// lj_dsf.cu
|
||||||
|
// -------------------
|
||||||
|
// W. Michael Brown (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the lj/cut/coul/dsf pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin : 7/12/2012
|
||||||
|
// email : brownw@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> q_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> q_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define q_tex q_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// sqrt(pi), cast to the accumulation type
#define MY_PIS (acctyp)1.77245385090551602729
|
||||||
|
|
||||||
|
// General lj/cut/coul/dsf pair kernel: the per-type tables lj1/lj3 are read
// directly through their pointers and indexed as itype*lj_types+jtype.
//
//   x_          positions; the .w component carries the atom type
//   lj1         per type pair: x,y = LJ force coefficients, z = LJ cutoff^2,
//               w = overall cutoff^2
//   lj3         per type pair: x,y = LJ energy coefficients, z = energy offset
//   lj_types    row length of the lj1/lj3 tables
//   sp_lj_in    8 special factors: [0-3] LJ, [4-7] Coulomb
//   dev_nbor, dev_packed, nbor_pitch
//               neighbor data consumed by nbor_info() (defined elsewhere)
//   ans, engv   outputs written by store_answers_q() (defined elsewhere)
//   eflag/vflag accumulate energies / virials when > 0
//   inum        number of atoms processed; work items with ii>=inum do nothing
//   q_          per-atom charges
//   cut_coulsq, qqrd2e, e_shift, f_shift, alpha
//               scalar Coulomb (DSF) parameters
//   t_per_atom  threads cooperating on one atom
__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
                       __global numtyp4* lj3, const int lj_types,
                       __global numtyp *sp_lj_in, __global int *dev_nbor,
                       __global int *dev_packed, __global acctyp4 *ans,
                       __global acctyp *engv, const int eflag,
                       const int vflag, const int inum,
                       const int nbor_pitch, __global numtyp *q_ ,
                       const numtyp cut_coulsq, const numtyp qqrd2e,
                       const numtyp e_shift, const numtyp f_shift,
                       const numtyp alpha, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Local copy of the special-bond factors (every work item stores the same
  // eight values)
  __local numtyp sp_lj[8];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];
  sp_lj[4]=sp_lj_in[4];
  sp_lj[5]=sp_lj_in[5];
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

  // Per-thread accumulators
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int itype=ix.w;

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // sbmask() (defined elsewhere) picks the special-factor slot for this
      // neighbor entry; the index itself is recovered with NEIGHMASK
      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (rsq<lj1[mtype].w) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force_lj, force, r6inv;

        // r6inv is only set inside the LJ cutoff; later uses repeat the test
        if (rsq < lj1[mtype].z) {
          r6inv = r2inv*r2inv*r2inv;
          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        } else
          force_lj = (numtyp)0.0;

        // r, prefactor and erfcc are only set inside the Coulomb cutoff
        if (rsq < cut_coulsq) {
          r = ucl_sqrt(rsq);
          fetch(prefactor,j,q_tex);
          prefactor *= factor_coul * qqrd2e*qtmp/r;
          numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
          // Polynomial in t scaled by erfcd; EWALD_P and A1..A5 are defined
          // elsewhere (presumably an erfc(alpha*r) approximation)
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
          erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
          // Literal cast to numtyp, like every other constant in this kernel,
          // so that single-precision builds do not promote to double
          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                                   rsq*f_shift);
        } else
          forcecoul = (numtyp)0.0;

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          if (rsq < cut_coulsq) {
            numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
            e_coul += e;
          }
          if (rsq < lj1[mtype].z) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// Fast lj/cut/coul/dsf pair kernel: the per-type tables are first staged into
// __local arrays of MAX_SHARED_TYPES*MAX_SHARED_TYPES entries and indexed as
// MAX_SHARED_TYPES*itype+jtype.
//
//   x_          positions; the .w component carries the atom type
//   lj1_in      per type pair: x,y = LJ force coefficients, z = LJ cutoff^2,
//               w = overall cutoff^2
//   lj3_in      per type pair: x,y = LJ energy coefficients, z = energy
//               offset (staged only when eflag>0)
//   sp_lj_in    8 special factors: [0-3] LJ, [4-7] Coulomb
//   dev_nbor, dev_packed, nbor_pitch
//               neighbor data consumed by nbor_info() (defined elsewhere)
//   ans, engv   outputs written by store_answers_q() (defined elsewhere)
//   eflag/vflag accumulate energies / virials when > 0
//   inum        number of atoms processed; work items with ii>=inum do nothing
//   q_          per-atom charges
//   cut_coulsq, qqrd2e, e_shift, f_shift, alpha
//               scalar Coulomb (DSF) parameters
//   t_per_atom  threads cooperating on one atom
__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                            __global numtyp4* lj3_in, __global numtyp* sp_lj_in,
                            __global int *dev_nbor, __global int *dev_packed,
                            __global acctyp4 *ans, __global acctyp *engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, __global numtyp *q_,
                            const numtyp cut_coulsq, const numtyp qqrd2e,
                            const numtyp e_shift, const numtyp f_shift,
                            const numtyp alpha, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Stage the tables: the work item with index tid copies entry tid
  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[8];
  if (tid<8)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];
  }

  // Per-thread accumulators
  acctyp energy=(acctyp)0;
  acctyp e_coul=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // The staged tables must be complete before any work item reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp qtmp; fetch(qtmp,i,q_tex);
    int iw=ix.w;
    // Row offset of type i in the staged tables
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<list_end; nbor+=n_stride) {
      int j=*nbor;

      // sbmask() (defined elsewhere) picks the special-factor slot for this
      // neighbor entry; the index itself is recovered with NEIGHMASK
      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
      factor_lj = sp_lj[sbmask(j)];
      factor_coul = sp_lj[sbmask(j)+4];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<lj1[mtype].w) {
        numtyp r2inv=ucl_recip(rsq);
        numtyp forcecoul, force_lj, force, r6inv;

        // r6inv is only set inside the LJ cutoff; later uses repeat the test
        if (rsq < lj1[mtype].z) {
          r6inv = r2inv*r2inv*r2inv;
          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        } else
          force_lj = (numtyp)0.0;

        // r, prefactor and erfcc are only set inside the Coulomb cutoff
        if (rsq < cut_coulsq) {
          r = ucl_sqrt(rsq);
          fetch(prefactor,j,q_tex);
          prefactor *= factor_coul * qqrd2e*qtmp/r;
          numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
          // Polynomial in t scaled by erfcd; EWALD_P and A1..A5 are defined
          // elsewhere (presumably an erfc(alpha*r) approximation)
          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
          erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
          // Literal cast to numtyp, like every other constant in this kernel,
          // so that single-precision builds do not promote to double
          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
                                   rsq*f_shift);
        } else
          forcecoul = (numtyp)0.0;

        force = (force_lj + forcecoul) * r2inv;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          if (rsq < cut_coulsq) {
            numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
            e_coul += e;
          }
          if (rsq < lj1[mtype].z) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
          }
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
                    vflag,ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_dsf.h
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the lj/cut/coul/dsf pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 7/12/2012
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_LJ_DSF_H
|
||||||
|
#define LAL_LJ_DSF_H
|
||||||
|
|
||||||
|
#include "lal_base_charge.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class LJDSF : public BaseCharge<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
LJDSF();
|
||||||
|
~LJDSF();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||||
|
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||||
|
double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, double **host_cut_ljsq,
|
||||||
|
const double host_cut_coulsq, double *host_special_coul,
|
||||||
|
const double qqrd2e, const double e_shift, const double f_shift,
|
||||||
|
const double alpha);
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq
|
||||||
|
UCL_D_Vec<numtyp4> lj1;
|
||||||
|
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
|
||||||
|
UCL_D_Vec<numtyp4> lj3;
|
||||||
|
/// Special LJ values [0-3] and Special Coul values [4-7]
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
numtyp _qqrd2e;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,132 @@
|
||||||
|
/***************************************************************************
|
||||||
|
lj_dsf_ext.cpp
|
||||||
|
-------------------
|
||||||
|
W. Michael Brown (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin : 7/12/2012
|
||||||
|
email : brownw@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_lj_dsf.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
// File-local instance that backs the ljd_gpu_* wrapper functions in this file
static LJDSF<PRECISION,ACC_PRECISION> LJDMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Set up the lj/cut/coul/dsf accelerator object for a new run.
//
// Any previous allocation is released first.  World rank 0 initializes (and
// compiles) before everyone else; the remaining processes then initialize
// one GPU rank at a time, separated by barriers.  Progress messages go to
// `screen` on replica 0 only.
//
// gpu_mode is an output: it receives the mode reported by the device object.
// max_nbors is forwarded to LJDSF::init(); the previous code ignored the
// argument and always passed the literal 300.
//
// Returns the status of the LJDSF::init() call made on this process (0 when
// no call failed); on 0 the GPU overhead estimate is also run.
int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 double **host_lj2, double **host_lj3, double **host_lj4,
                 double **offset, double *special_lj, const int inum,
                 const int nall, const int max_nbors, const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen,
                 double **host_cut_ljsq, const double host_cut_coulsq,
                 double *host_special_coul, const double qqrd2e,
                 const double e_shift, const double f_shift,
                 const double alpha) {
  LJDMF.clear();
  gpu_mode=LJDMF.device->gpu_mode();
  double gpu_split=LJDMF.device->particle_split();
  int first_gpu=LJDMF.device->first_device();
  int last_gpu=LJDMF.device->last_device();
  int world_me=LJDMF.device->world_me();
  int gpu_rank=LJDMF.device->gpu_rank();
  int procs_per_gpu=LJDMF.device->procs_per_gpu();

  LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu);

  // Only replica 0 with a valid screen handle reports progress
  bool message=false;
  if (LJDMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Process 0 goes first, on its own
  int init_ok=0;
  if (world_me==0)
    init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                       host_lj4, offset, special_lj, inum, nall, max_nbors,
                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
                       host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
                       f_shift, alpha);

  LJDMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining processes initialize one GPU rank per pass
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                         offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen, host_cut_ljsq,
                         host_cut_coulsq, host_special_coul, qqrd2e, e_shift,
                         f_shift, alpha);

    LJDMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    LJDMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Release everything held by the file-level lj/cut/coul/dsf accelerator.
void ljd_gpu_clear()
{
  LJDMF.clear();
}
|
||||||
|
|
||||||
|
int** ljd_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_q, double *boxlo,
|
||||||
|
double *prd) {
|
||||||
|
return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_q, boxlo, prd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward one step to the LJDSF::compute() overload that takes a
// caller-supplied neighbor list (ilist / numj / firstneigh); every argument
// is passed through unchanged.
void ljd_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success, double *host_q,
                     const int nlocal, double *boxlo, double *prd)
{
  LJDMF.compute(ago, inum_full, nall, host_x, host_type,
                ilist, numj, firstneigh,
                eflag, vflag, eatom, vatom,
                host_start, cpu_time, success,
                host_q, nlocal, boxlo, prd);
}
|
||||||
|
|
||||||
|
double ljd_gpu_bytes() {
|
||||||
|
return LJDMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,315 @@
|
||||||
|
/***************************************************************************
|
||||||
|
yukawa_colloid.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the yukawa/colloid pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
#include "yukawa_colloid_cl.h"
|
||||||
|
#elif defined(USE_CUDART)
|
||||||
|
const char *yukawa_colloid=0;
|
||||||
|
#else
|
||||||
|
#include "yukawa_colloid_cubin.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "lal_yukawa_colloid.h"
|
||||||
|
#include <cassert>
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
#define YukawaColloidT YukawaColloid<numtyp, acctyp>
|
||||||
|
|
||||||
|
extern Device<PRECISION,ACC_PRECISION> device;
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(),
|
||||||
|
_allocated(false), _max_rad_size(0) {
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
YukawaColloidT::~YukawaColloid() {
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int YukawaColloidT::bytes_per_atom(const int max_nbors) const {
|
||||||
|
return this->bytes_per_atom_atomic(max_nbors);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int YukawaColloidT::init(const int ntypes,
|
||||||
|
double **host_cutsq, double **host_a,
|
||||||
|
double **host_offset, double *host_special_lj, const int nlocal,
|
||||||
|
const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *_screen, const double kappa) {
|
||||||
|
int success;
|
||||||
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
|
_screen,yukawa_colloid,"k_yukawa_colloid");
|
||||||
|
if (success!=0)
|
||||||
|
return success;
|
||||||
|
|
||||||
|
// allocate rad
|
||||||
|
|
||||||
|
bool cpuview=false;
|
||||||
|
if (this->ucl_device->device_type()==UCL_CPU)
|
||||||
|
cpuview=true;
|
||||||
|
|
||||||
|
int ef_nall=nall;
|
||||||
|
if (ef_nall==0)
|
||||||
|
ef_nall=2000;
|
||||||
|
|
||||||
|
_max_rad_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||||
|
host_rad.alloc(_max_rad_size,*(this->ucl_device));
|
||||||
|
if (cpuview)
|
||||||
|
dev_rad.view(host_rad);
|
||||||
|
else
|
||||||
|
dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||||
|
|
||||||
|
rad_tex.get_texture(*(this->pair_program),"rad_tex");
|
||||||
|
rad_tex.bind_float(dev_rad,1);
|
||||||
|
|
||||||
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
|
int lj_types=ntypes;
|
||||||
|
shared_types=false;
|
||||||
|
int max_shared_types=this->device->max_shared_types();
|
||||||
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
|
lj_types=max_shared_types;
|
||||||
|
shared_types=true;
|
||||||
|
}
|
||||||
|
_lj_types=lj_types;
|
||||||
|
|
||||||
|
_kappa = kappa;
|
||||||
|
|
||||||
|
// Allocate a host write buffer for data initialization
|
||||||
|
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||||
|
UCL_WRITE_OPTIMIZED);
|
||||||
|
|
||||||
|
for (int i=0; i<lj_types*lj_types*32; i++)
|
||||||
|
host_write[i]=(numtyp)0.0;
|
||||||
|
|
||||||
|
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
|
||||||
|
host_offset,host_cutsq);
|
||||||
|
|
||||||
|
UCL_H_Vec<double> dview;
|
||||||
|
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
|
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||||
|
ucl_copy(sp_lj,dview,false);
|
||||||
|
|
||||||
|
_allocated=true;
|
||||||
|
this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void YukawaColloidT::clear() {
|
||||||
|
if (!_allocated)
|
||||||
|
return;
|
||||||
|
_allocated=false;
|
||||||
|
|
||||||
|
coeff.clear();
|
||||||
|
sp_lj.clear();
|
||||||
|
|
||||||
|
host_rad.clear();
|
||||||
|
dev_rad.clear();
|
||||||
|
|
||||||
|
this->clear_atomic();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
double YukawaColloidT::host_memory_usage() const {
|
||||||
|
return this->host_memory_usage_atomic()+sizeof(YukawaColloid<numtyp,acctyp>);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Copy nbor list from host if necessary and then compute atom energies/forces
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void YukawaColloidT::compute(const int f_ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type, int *ilist,
|
||||||
|
int *numj, int **firstneigh, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
const double cpu_time, bool &success, double *rad) {
|
||||||
|
this->acc_timers();
|
||||||
|
|
||||||
|
// ------------------- Resize rad array --------------------------
|
||||||
|
|
||||||
|
if (nall>_max_rad_size) {
|
||||||
|
dev_rad.clear();
|
||||||
|
host_rad.clear();
|
||||||
|
|
||||||
|
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||||
|
host_rad.alloc(_max_rad_size,*(this->ucl_device));
|
||||||
|
|
||||||
|
if (this->ucl_device->device_type()==UCL_CPU) {
|
||||||
|
if (sizeof(numtyp)==sizeof(double)) {
|
||||||
|
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
|
||||||
|
dev_rad.view(host_rad);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
|
||||||
|
}
|
||||||
|
|
||||||
|
rad_tex.bind_float(dev_rad,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
if (inum_full==0) {
|
||||||
|
host_start=0;
|
||||||
|
// Make sure textures are correct if realloc by a different hybrid style
|
||||||
|
this->resize_atom(0,nall,success);
|
||||||
|
this->zero_timers();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ago=this->hd_balancer.ago_first(f_ago);
|
||||||
|
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
|
||||||
|
this->ans->inum(inum);
|
||||||
|
host_start=inum;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
|
||||||
|
if (ago==0) {
|
||||||
|
this->reset_nbors(nall, inum, ilist, numj, firstneigh, success);
|
||||||
|
if (!success)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this->atom->cast_x_data(host_x,host_type);
|
||||||
|
this->cast_rad_data(rad);
|
||||||
|
this->hd_balancer.start_timer();
|
||||||
|
this->atom->add_x_data(host_x,host_type);
|
||||||
|
this->add_rad_data();
|
||||||
|
|
||||||
|
this->loop(eflag,vflag);
|
||||||
|
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||||
|
this->device->add_ans_object(this->ans);
|
||||||
|
this->hd_balancer.stop_timer();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Reneighbor on GPU and then compute atom energies/forces
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall,
|
||||||
|
double **host_x, int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time, bool &success,
|
||||||
|
double *rad) {
|
||||||
|
this->acc_timers();
|
||||||
|
|
||||||
|
// ------------------- Resize rad array ----------------------------
|
||||||
|
|
||||||
|
if (nall>_max_rad_size) {
|
||||||
|
dev_rad.clear();
|
||||||
|
host_rad.clear();
|
||||||
|
|
||||||
|
_max_rad_size=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||||
|
host_rad.alloc(_max_rad_size,*(this->ucl_device));
|
||||||
|
|
||||||
|
if (this->ucl_device->device_type()==UCL_CPU) {
|
||||||
|
if (sizeof(numtyp)==sizeof(double)) {
|
||||||
|
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
|
||||||
|
dev_rad.view(host_rad);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dev_rad.alloc(_max_rad_size,*(this->ucl_device));
|
||||||
|
}
|
||||||
|
|
||||||
|
rad_tex.bind_float(dev_rad,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------
|
||||||
|
|
||||||
|
if (inum_full==0) {
|
||||||
|
host_start=0;
|
||||||
|
// Make sure textures are correct if realloc by a different hybrid style
|
||||||
|
this->resize_atom(0,nall,success);
|
||||||
|
this->zero_timers();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load balance, returning the atom count on the device (inum)
|
||||||
|
this->hd_balancer.balance(cpu_time);
|
||||||
|
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
|
||||||
|
this->ans->inum(inum);
|
||||||
|
host_start=inum;
|
||||||
|
|
||||||
|
// Build neighbor list on GPU if necessary
|
||||||
|
if (ago==0) {
|
||||||
|
this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||||
|
sublo, subhi, tag, nspecial, special, success);
|
||||||
|
if (!success)
|
||||||
|
return NULL;
|
||||||
|
this->cast_rad_data(rad);
|
||||||
|
this->hd_balancer.start_timer();
|
||||||
|
} else {
|
||||||
|
this->atom->cast_x_data(host_x,host_type);
|
||||||
|
this->cast_rad_data(rad);
|
||||||
|
this->hd_balancer.start_timer();
|
||||||
|
this->atom->add_x_data(host_x,host_type);
|
||||||
|
}
|
||||||
|
this->add_rad_data();
|
||||||
|
*ilist=this->nbor->host_ilist.begin();
|
||||||
|
*jnum=this->nbor->host_acc.begin();
|
||||||
|
|
||||||
|
this->loop(eflag,vflag);
|
||||||
|
this->ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||||
|
this->device->add_ans_object(this->ans);
|
||||||
|
this->hd_balancer.stop_timer();
|
||||||
|
|
||||||
|
return this->nbor->host_jlist.begin()-host_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Calculate per-atom energies and forces
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
void YukawaColloidT::loop(const bool _eflag, const bool _vflag) {
|
||||||
|
// Compute the block size and grid size to keep all cores busy
|
||||||
|
const int BX=this->block_size();
|
||||||
|
int eflag, vflag;
|
||||||
|
if (_eflag)
|
||||||
|
eflag=1;
|
||||||
|
else
|
||||||
|
eflag=0;
|
||||||
|
|
||||||
|
if (_vflag)
|
||||||
|
vflag=1;
|
||||||
|
else
|
||||||
|
vflag=0;
|
||||||
|
|
||||||
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
|
(BX/this->_threads_per_atom)));
|
||||||
|
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
this->time_pair.start();
|
||||||
|
if (shared_types) {
|
||||||
|
this->k_pair_fast.set_size(GX,BX);
|
||||||
|
this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
|
||||||
|
} else {
|
||||||
|
this->k_pair.set_size(GX,BX);
|
||||||
|
this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj,
|
||||||
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
&this->ans->force, &this->ans->engv, &eflag, &vflag,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa);
|
||||||
|
}
|
||||||
|
this->time_pair.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
template class YukawaColloid<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,202 @@
|
||||||
|
// **************************************************************************
|
||||||
|
// yukawa_colloid.cu
|
||||||
|
// -------------------
|
||||||
|
// Trung Dac Nguyen (ORNL)
|
||||||
|
//
|
||||||
|
// Device code for acceleration of the yukawa/colloid pair style
|
||||||
|
//
|
||||||
|
// __________________________________________________________________________
|
||||||
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
// __________________________________________________________________________
|
||||||
|
//
|
||||||
|
// begin :
|
||||||
|
// email : nguyentd@ornl.gov
|
||||||
|
// ***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef NV_KERNEL
|
||||||
|
|
||||||
|
#include "lal_aux_fun1.h"
|
||||||
|
#ifndef _DOUBLE_DOUBLE
|
||||||
|
texture<float4> pos_tex;
|
||||||
|
texture<float> rad_tex;
|
||||||
|
#else
|
||||||
|
texture<int4,1> pos_tex;
|
||||||
|
texture<int2> rad_tex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define pos_tex x_
|
||||||
|
#define rad_tex rad_
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Yukawa/colloid pair kernel, general version: per-type coefficients are
// read from the coeff array indexed by itype*lj_types+jtype
// (coeff.x scales the force, coeff.y is subtracted from the energy,
// coeff.z is compared against r^2).
__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
                               __global numtyp4 *coeff, const int lj_types,
                               __global numtyp *sp_lj_in, __global int *dev_nbor,
                               __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
                               const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom,
                               const numtyp kappa) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Local copy of the four special-bond factors
  __local numtyp sp_lj[4];
  for (int k=0; k<4; k++)
    sp_lj[k]=sp_lj_in[k];

  // Accumulators for this work-item
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int k=0; k<6; k++)
    virial[k]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp radi; fetch(radi,i,rad_tex);
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The packed neighbor index carries the special-bond selector
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      numtyp radj; fetch(radj,j,rad_tex);
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      // Skip pairs that are not inside the cutoff
      if (!(rsq<coeff[mtype].z))
        continue;

      numtyp r = ucl_sqrt(rsq);
      numtyp rinv = ucl_recip(r);
      // Screening is measured from the surface separation r-(radi+radj)
      numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
      numtyp force = coeff[mtype].x * screening;

      force = factor_lj*force * rinv;

      f.x+=delx*force;
      f.y+=dely*force;
      f.z+=delz*force;

      if (eflag>0) {
        numtyp e=coeff[mtype].x/kappa * screening;
        energy+=factor_lj*(e-coeff[mtype].y);
      }
      if (vflag>0) {
        virial[0] += delx*delx*force;
        virial[1] += dely*dely*force;
        virial[2] += delz*delz*force;
        virial[3] += delx*dely*force;
        virial[4] += delx*delz*force;
        virial[5] += dely*delz*force;
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
||||||
|
// Yukawa/colloid pair kernel, shared-type version: the coefficient table is
// staged into a __local array of MAX_SHARED_TYPES*MAX_SHARED_TYPES entries
// and indexed by itype*MAX_SHARED_TYPES+jtype.
__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
                                    __global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
                                    __global int *dev_nbor, __global int *dev_packed,
                                    __global acctyp4 *ans, __global acctyp *engv,
                                    const int eflag, const int vflag, const int inum,
                                    const int nbor_pitch, const int t_per_atom,
                                    const numtyp kappa) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);

  // Each of the first work-items stages one table entry
  __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    coeff[tid]=coeff_in[tid];

  // Accumulators for this work-item
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp virial[6];
  for (int k=0; k<6; k++)
    virial[k]=(acctyp)0;

  // The staged tables must be complete before anyone reads them
  __syncthreads();

  if (ii<inum) {
    __global int *nbor, *list_end;
    int i, numj, n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,list_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    numtyp radi; fetch(radi,i,rad_tex);
    int iw=ix.w;
    // Row offset of atom i's type in the staged table
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      // The packed neighbor index carries the special-bond selector
      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      numtyp radj; fetch(radj,j,rad_tex);
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      // Skip pairs that are not inside the cutoff
      if (!(rsq<coeff[mtype].z))
        continue;

      numtyp r = ucl_sqrt(rsq);
      numtyp rinv = ucl_recip(r);
      // Screening is measured from the surface separation r-(radi+radj)
      numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
      numtyp force = coeff[mtype].x * screening;

      force = factor_lj*force * rinv;

      f.x+=delx*force;
      f.y+=dely*force;
      f.z+=delz*force;

      if (eflag>0) {
        numtyp e=coeff[mtype].x/kappa * screening;
        energy+=factor_lj*(e-coeff[mtype].y);
      }
      if (vflag>0) {
        virial[0] += delx*delx*force;
        virial[1] += dely*dely*force;
        virial[2] += delz*delz*force;
        virial[3] += delx*dely*force;
        virial[4] += delx*delz*force;
        virial[5] += dely*delz*force;
      }

    } // for nbor
    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                  ans,engv);
  } // if ii
}
|
||||||
|
|
|
@ -0,0 +1,127 @@
|
||||||
|
/***************************************************************************
|
||||||
|
yukawa_colloid.h
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Class for acceleration of the yukawa/colloid pair style.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef LAL_YUKAWA_COLLOID_H
|
||||||
|
#define LAL_YUKAWA_COLLOID_H
|
||||||
|
|
||||||
|
#include "lal_base_atomic.h"
|
||||||
|
|
||||||
|
namespace LAMMPS_AL {
|
||||||
|
|
||||||
|
template <class numtyp, class acctyp>
|
||||||
|
class YukawaColloid : public BaseAtomic<numtyp, acctyp> {
|
||||||
|
public:
|
||||||
|
YukawaColloid();
|
||||||
|
~YukawaColloid();
|
||||||
|
|
||||||
|
/// Clear any previous data and set up for a new LAMMPS run
|
||||||
|
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||||
|
* \param cell_size cutoff + skin
|
||||||
|
* \param gpu_split fraction of particles handled by device
|
||||||
|
*
|
||||||
|
* Returns:
|
||||||
|
* - 0 if successfull
|
||||||
|
* - -1 if fix gpu not found
|
||||||
|
* - -3 if there is an out of memory error
|
||||||
|
* - -4 if the GPU library was not compiled for GPU
|
||||||
|
* - -5 Double precision is not supported on card **/
|
||||||
|
int init(const int ntypes, double **host_cutsq,
|
||||||
|
double **host_a, double **host_offset, double *host_special_lj,
|
||||||
|
const int nlocal, const int nall, const int max_nbors,
|
||||||
|
const int maxspecial, const double cell_size,
|
||||||
|
const double gpu_split, FILE *screen, const double kappa);
|
||||||
|
|
||||||
|
inline void cast_rad_data(double* rad) {
|
||||||
|
int nall = this->atom->nall();
|
||||||
|
if (this->ucl_device->device_type()==UCL_CPU) {
|
||||||
|
if (sizeof(numtyp)==sizeof(double)) {
|
||||||
|
host_rad.view((numtyp*)rad,nall,*(this->ucl_device));
|
||||||
|
dev_rad.view(host_rad);
|
||||||
|
} else {
|
||||||
|
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (sizeof(numtyp)==sizeof(double))
|
||||||
|
memcpy(host_rad.begin(),rad,nall*sizeof(numtyp));
|
||||||
|
else {
|
||||||
|
for (int i=0; i<nall; i++) host_rad[i]=rad[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy rad to device asynchronously
|
||||||
|
inline void add_rad_data() {
|
||||||
|
ucl_copy(dev_rad,host_rad,this->atom->nall(),true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear all host and device data
|
||||||
|
/** \note This is called at the beginning of the init() routine **/
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
/// Returns memory usage on device per atom
|
||||||
|
int bytes_per_atom(const int max_nbors) const;
|
||||||
|
|
||||||
|
/// Total host memory used by library for pair style
|
||||||
|
double host_memory_usage() const;
|
||||||
|
|
||||||
|
/// Pair loop with host neighboring
|
||||||
|
void compute(const int f_ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
int *ilist, int *numj, int **firstneigh,
|
||||||
|
const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
const double cpu_time, bool &success, double *rad);
|
||||||
|
|
||||||
|
/// Pair loop with device neighboring
|
||||||
|
int** compute(const int ago, const int inum_full, const int nall,
|
||||||
|
double **host_x, int *host_type, double *sublo,
|
||||||
|
double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *rad);
|
||||||
|
|
||||||
|
// --------------------------- TEXTURES -----------------------------
|
||||||
|
UCL_Texture rad_tex;
|
||||||
|
|
||||||
|
// --------------------------- TYPE DATA --------------------------
|
||||||
|
|
||||||
|
/// coeff.x = a, coeff.y = offset, coeff.z = cutsq
|
||||||
|
UCL_D_Vec<numtyp4> coeff;
|
||||||
|
/// Special LJ values
|
||||||
|
UCL_D_Vec<numtyp> sp_lj;
|
||||||
|
|
||||||
|
/// If atom type constants fit in shared memory, use fast kernels
|
||||||
|
bool shared_types;
|
||||||
|
|
||||||
|
/// Number of atom types
|
||||||
|
int _lj_types;
|
||||||
|
|
||||||
|
int _max_rad_size;
|
||||||
|
|
||||||
|
numtyp _kappa;
|
||||||
|
|
||||||
|
/// Per-atom arrays
|
||||||
|
UCL_H_Vec<numtyp> host_rad;
|
||||||
|
UCL_D_Vec<numtyp> dev_rad;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool _allocated;
|
||||||
|
void loop(const bool _eflag, const bool _vflag);
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,123 @@
|
||||||
|
/***************************************************************************
|
||||||
|
yukawa_colloid_ext.cpp
|
||||||
|
-------------------
|
||||||
|
Trung Dac Nguyen (ORNL)
|
||||||
|
|
||||||
|
Functions for LAMMPS access to yukawa/colloid acceleration routines.
|
||||||
|
|
||||||
|
__________________________________________________________________________
|
||||||
|
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||||
|
__________________________________________________________________________
|
||||||
|
|
||||||
|
begin :
|
||||||
|
email : nguyentd@ornl.gov
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <cassert>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include "lal_yukawa_colloid.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace LAMMPS_AL;
|
||||||
|
|
||||||
|
static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Allocate memory on host and device and copy constants to device
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Set up the yukawa/colloid accelerator object for a new run.
//
// Any previous allocation is released first.  World rank 0 initializes (and
// compiles) before everyone else; the remaining processes then initialize
// one GPU rank at a time, separated by barriers.  Progress messages go to
// `screen` on replica 0 only.
//
// gpu_mode is an output: it receives the mode reported by the device object.
// max_nbors is forwarded to YukawaColloid::init(); the previous code ignored
// the argument and always passed the literal 300.
//
// Returns the status of the init() call made on this process (0 when no call
// failed); on 0 the GPU overhead estimate is also run.
int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a,
                       double **host_offset, double *special_lj, const int inum,
                       const int nall, const int max_nbors, const int maxspecial,
                       const double cell_size, int &gpu_mode, FILE *screen,
                       const double kappa) {
  YKCOLLMF.clear();
  gpu_mode=YKCOLLMF.device->gpu_mode();
  double gpu_split=YKCOLLMF.device->particle_split();
  int first_gpu=YKCOLLMF.device->first_device();
  int last_gpu=YKCOLLMF.device->last_device();
  int world_me=YKCOLLMF.device->world_me();
  int gpu_rank=YKCOLLMF.device->gpu_rank();
  int procs_per_gpu=YKCOLLMF.device->procs_per_gpu();

  YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu);

  // Only replica 0 with a valid screen handle reports progress
  bool message=false;
  if (YKCOLLMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // Process 0 goes first, on its own
  int init_ok=0;
  if (world_me==0)
    init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
                          inum, nall, max_nbors, maxspecial, cell_size,
                          gpu_split, screen, kappa);

  YKCOLLMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining processes initialize one GPU rank per pass
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj,
                            inum, nall, max_nbors, maxspecial, cell_size,
                            gpu_split, screen, kappa);

    YKCOLLMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    YKCOLLMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||||
|
|
||||||
|
// Release everything held by the file-level yukawa/colloid accelerator.
void ykcolloid_gpu_clear()
{
  YKCOLLMF.clear();
}
|
||||||
|
|
||||||
|
int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full,
|
||||||
|
const int nall, double **host_x, int *host_type,
|
||||||
|
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||||
|
int **special, const bool eflag, const bool vflag,
|
||||||
|
const bool eatom, const bool vatom, int &host_start,
|
||||||
|
int **ilist, int **jnum, const double cpu_time,
|
||||||
|
bool &success, double *host_rad) {
|
||||||
|
return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||||
|
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||||
|
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||||
|
host_rad);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward one step to the YukawaColloid::compute() overload that takes a
// caller-supplied neighbor list (ilist / numj / firstneigh); every argument
// is passed through unchanged.
void ykcolloid_gpu_compute(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *ilist, int *numj, int **firstneigh,
                           const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           const double cpu_time, bool &success, double *host_rad)
{
  YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type,
                   ilist, numj, firstneigh,
                   eflag, vflag, eatom, vatom,
                   host_start, cpu_time, success, host_rad);
}
|
||||||
|
|
||||||
|
double ykcolloid_gpu_bytes() {
|
||||||
|
return YKCOLLMF.host_memory_usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue