diff --git a/lib/gpu/geryon/ucl_matrix.h b/lib/gpu/geryon/ucl_matrix.h
new file mode 100644
index 0000000000..803cd78b4c
--- /dev/null
+++ b/lib/gpu/geryon/ucl_matrix.h
@@ -0,0 +1,224 @@
+/***************************************************************************
+                                ucl_matrix.h
+                             -------------------
+                               W. Michael Brown
+
+  Matrix Container on Host
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Thu May 10 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+// Only allow this file to be included by CUDA and OpenCL specific headers
+#ifdef _UCL_MAT_ALLOW
+
+/// Matrix S-Object
+template <class hosttype, class devtype>
+class UCL_Matrix {
+ public:
+  // Traits for copying data
+  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
+  enum traits {
+    DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
+    MEM_TYPE = 1,
+    PADDED = 0,
+    ROW_MAJOR = 1,
+    VECTOR = 0
+  };
+  typedef hosttype data_type;
+
+  /// Host Allocation
+  UCL_H_Mat<hosttype> host;
+
+  /// Device Allocation
+  UCL_D_Mat<devtype> device;
+
+  UCL_Matrix() { }
+  ~UCL_Matrix() { }
+
+  /// Construct with specified number of rows and columns
+  /** \sa alloc() **/
+  UCL_Matrix(const size_t rows, const size_t cols, UCL_Device &acc,
+             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
+
+  /// Set up host matrix with specified # of rows/cols and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \note When passing a command queue instead of a device, the device
+    *       allocation is always performed, even if the device shares
+    *       memory with the host.
+    * \param cq Default command queue for operations copied from another mat
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  template <class mat_type>
+  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,cq,kind1,kind2); }
+
+  /// Set up host matrix with specified # of rows/cols and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \param acc Used to get the default command queue for operations
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  inline int alloc(const size_t rows, const size_t cols, UCL_Device &acc,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,rows,cols,acc,kind1,kind2); }
+
+  /// Free memory and set size to 0
+  inline void clear()
+    { host.clear(); device.clear(); }
+
+  /// Resize the allocation to contain rows*cols elements
+  inline int resize(const int rows, const int cols) {
+    assert(host.kind()!=UCL_VIEW);
+    int err=host.resize(rows,cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+    return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+      dev_resize(device,host,_buffer,rows,cols);
+  }
+
+  /// Resize (only if bigger) the allocation to contain rows*cols elements
+  inline int resize_ib(const int new_rows, const int new_cols)
+    { if (new_rows>rows() || new_cols>cols()) return resize(new_rows,new_cols);
+      else return UCL_SUCCESS; }
+
+  /// Set each element to zero
+  inline void zero() { host.zero(); device.zero(); }
+
+  /// Set first n elements to zero
+  inline void zero(const int n) { host.zero(n); device.zero(n); }
+
+  /// Get the number of elements
+  inline size_t numel() const { return host.numel(); }
+  /// Get the number of rows
+  inline size_t rows() const { return host.rows(); }
+  /// Get the number of columns
+  inline size_t cols() const { return host.cols(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t host_mem_usage()
+    { return host.row_bytes()*host.rows()+_buffer.row_bytes()*_buffer.rows(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t device_mem_usage()
+    { return device.row_bytes()*device.rows(); }
+
+  /// Get element at index i
+  inline hosttype & operator[](const int i) { return host[i]; }
+  /// Get element at index i
+  inline const hosttype & operator[](const int i) const { return host[i]; }
+  /// 2D access
+  inline hosttype & operator()(const int row, const int col)
+    { return host(row,col); }
+  /// 2D access
+  inline const hosttype & operator()(const int row, const int col) const
+    { return host(row,col); }
+
+  /// Returns pointer to memory pointer for allocation on host
+  inline hosttype ** host_ptr() { return host.host_ptr(); }
+
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return host.cq(); }
+  /// Block until command_queue associated with matrix is complete
+  inline void sync() { host.sync(); }
+
+  /// Get the size of a row on the host (including any padding) in elements
+  inline size_t row_size() const { return host.row_size(); }
+  /// Get the size of a row on the host (including any padding) in bytes
+  inline size_t row_bytes() const { return host.row_bytes(); }
+  /// Get the size on the host in bytes of 1 element
+  inline int element_size() const { return sizeof(hosttype); }
+
+  /// Update the allocation on the host asynchronously
+  inline void update_host()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,true); }
+  /// Update the allocation on the host (true for asynchronous copy)
+  inline void update_host(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,async); }
+  /// Update the allocation on the host (using command queue)
+  inline void update_host(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,cq); }
+  /// Update the first n elements on the host (true for asynchronous copy)
+  inline void update_host(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,async); }
+  /// Update the first n elements on the host (using command queue)
+  inline void update_host(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,cq); }
+  /// Update slice on the host (true for asynchronous copy)
+  inline void update_host(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,async); }
+  /// Update slice on the host (using command queue)
+  inline void update_host(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,cq); }
+
+  /// Update the allocation on the device asynchronously
+  inline void update_device()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,true); }
+  /// Update the allocation on the device (true for asynchronous copy)
+  inline void update_device(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,async); }
+  /// Update the allocation on the device (using command queue)
+  inline void update_device(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,cq); }
+  /// Update the first n elements on the device (true for asynchronous copy)
+  inline void update_device(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,async); }
+  /// Update the first n elements on the device (using command queue)
+  inline void update_device(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,cq); }
+  /// Update slice on the device (true for asynchronous copy)
+  inline void update_device(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,async); }
+  /// Update slice on the device (using command queue)
+  inline void update_device(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,cq); }
+
+ private:
+  UCL_H_Mat<devtype> _buffer;
+};
+
+#endif
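The matrix s-object keeps a host view and a device view in lockstep. A minimal usage sketch (not part of the patch; the device object, element types, and sizes are illustrative assumptions):

```cpp
// Illustrative use of UCL_Matrix (sketch; assumes a CUDA or OpenCL Geryon
// front-end header has defined UCL_Device and the containers).
void matrix_demo() {
  UCL_Device dev;
  UCL_Matrix<double,float> m;            // double on host, float on device
  if (m.alloc(4,8,dev)!=UCL_SUCCESS) return;
  for (size_t i=0; i<m.numel(); i++)
    m[i]=1.0;                            // fill through the host view
  m.update_device();                     // asynchronous host->device copy
  // ... kernels consume m.device here ...
  m.update_host(false);                  // blocking device->host copy
}
```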
diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h
new file mode 100644
index 0000000000..ea772d3608
--- /dev/null
+++ b/lib/gpu/geryon/ucl_s_obj_help.h
@@ -0,0 +1,272 @@
+/***************************************************************************
+                              ucl_s_obj_help.h
+                             -------------------
+                               W. Michael Brown
+
+  Helper routines for allocating memory for s-objects and performing
+  host/device updates. (Different routines depending on whether the
+  same type is used on the host and device.)
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Mon May 14 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+template <int st> struct _ucl_s_obj_help;
+
+// Host and device containers are same type
+//   -- Don't need casting buffers
+//   -- Can potentially use same memory if shared by accelerator
+template <> struct _ucl_s_obj_help<1> {
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    if (acc.shared_memory()) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    if (acc.shared_memory()) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(rows,cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(rows,cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
+    ucl_copy(dst,src,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
+    ucl_copy(dst,src,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          const bool async) {
+    ucl_copy(dst,src,cols,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          command_queue &cq) {
+    ucl_copy(dst,src,cols,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, const bool async) {
+    ucl_copy(dst,src,rows,cols,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, command_queue &cq) {
+    ucl_copy(dst,src,rows,cols,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff,
+                               const int cols) {
+    if (device.kind()==UCL_VIEW) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(cols);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
+                               const int cols) {
+    if (device.kind()==UCL_VIEW) {
+      device.view(host);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(rows,cols);
+  }
+};
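The integer template argument that selects between the two helper specializations comes from a compile-time type-equality trait. A minimal illustration of the idiom, under the assumption that `ucl_same_type` follows the usual partial-specialization pattern (its real definition lives elsewhere in Geryon):

```cpp
// Assumed shape of the trait driving the dispatch (illustrative only):
template <class t1, class t2> struct same_type_demo { enum { ans=0 }; };
template <class t1> struct same_type_demo<t1,t1>    { enum { ans=1 }; };
// _ucl_s_obj_help< same_type_demo<hosttype,devtype>::ans > then resolves
// to the buffer-free specialization above when host and device types match,
// and to the casting version below otherwise.
```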
+
+// Host and device containers are different types
+template <int st> struct _ucl_s_obj_help {
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,acc,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+
+    if (acc.shared_memory()) {
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(cols,cq,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, UCL_Device &acc,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,acc,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(rows,cols,acc,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+
+    if (acc.shared_memory()) {
+      device.view(_buffer);
+      return UCL_SUCCESS;
+    } else
+      return device.alloc(rows,cols,acc,kind2);
+  }
+
+  template <class t1, class t2, class t3, class mat_type>
+  static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
+                          const int rows, const int cols, mat_type &cq,
+                          const enum UCL_MEMOPT kind1,
+                          const enum UCL_MEMOPT kind2) {
+    int e1;
+    e1=host.alloc(rows,cols,cq,UCL_NOT_PINNED);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    e1=_buffer.alloc(rows,cols,cq,kind1);
+    if (e1!=UCL_SUCCESS)
+      return e1;
+    return device.alloc(rows,cols,cq,kind2);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
+    ucl_cast_copy(dst,src,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
+    ucl_cast_copy(dst,src,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          const bool async) {
+    ucl_cast_copy(dst,src,cols,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
+                          command_queue &cq) {
+    ucl_cast_copy(dst,src,cols,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, const bool async) {
+    ucl_cast_copy(dst,src,rows,cols,buffer,async);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
+                          t3 &buffer, command_queue &cq) {
+    ucl_cast_copy(dst,src,rows,cols,buffer,cq);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff,
+                               const int cols) {
+    int err=buff.resize(cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+
+    if (device.kind()==UCL_VIEW) {
+      device.view(buff);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(cols);
+  }
+
+  template <class t1, class t2, class t3>
+  static inline int dev_resize(t1 &device, t2 &host, t3 &buff, const int rows,
+                               const int cols) {
+    int err=buff.resize(rows,cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+
+    if (device.kind()==UCL_VIEW) {
+      device.view(buff);
+      return UCL_SUCCESS;
+    } else
+      return device.resize(rows,cols);
+  }
+};
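For the mixed-precision case, every transfer is staged through the page-locked `_buffer`, which holds device-type data on the host. A sketch of the data flow for `hosttype=double`, `devtype=float` (assumed element-wise semantics; `ucl_cast_copy` performs the real work):

```cpp
// host (double) --cast--> _buffer (float, pinned) --DMA--> device (float)
// Roughly what the cast step amounts to, element by element:
void cast_stage_demo(const double *host, float *buffer, size_t n) {
  for (size_t i=0; i<n; i++)
    buffer[i]=static_cast<float>(host[i]);  // cast on the host side
  // ...then one contiguous buffer->device transfer on the command queue
}
```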
diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h
new file mode 100644
index 0000000000..01346127c2
--- /dev/null
+++ b/lib/gpu/geryon/ucl_vector.h
@@ -0,0 +1,223 @@
+/***************************************************************************
+                                ucl_vector.h
+                             -------------------
+                               W. Michael Brown
+
+  Vector Container on Host
+
+ __________________________________________________________________________
+    This file is part of the Geryon Unified Coprocessor Library (UCL)
+ __________________________________________________________________________
+
+    begin                : Thu May 10 2012
+    copyright            : (C) 2012 by W. Michael Brown
+    email                : brownw@ornl.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   This software is distributed under the Simplified BSD License.
+   ----------------------------------------------------------------------- */
+
+// Only allow this file to be included by CUDA and OpenCL specific headers
+#ifdef _UCL_MAT_ALLOW
+
+/// Row Vector S-Object
+template <class hosttype, class devtype>
+class UCL_Vector {
+ public:
+  // Traits for copying data
+  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
+  enum traits {
+    DATA_TYPE = _UCL_DATA_ID<hosttype>::id,
+    MEM_TYPE = 1,
+    PADDED = 0,
+    ROW_MAJOR = 1,
+    VECTOR = 1
+  };
+  typedef hosttype data_type;
+
+  /// Host Allocation
+  UCL_H_Vec<hosttype> host;
+
+  /// Device Allocation
+  UCL_D_Vec<devtype> device;
+
+  UCL_Vector() { }
+  ~UCL_Vector() { }
+
+  /// Construct with n columns
+  /** \sa alloc() **/
+  UCL_Vector(const size_t cols, UCL_Device &acc,
+             const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+             const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,acc,kind1,kind2); }
+
+  /// Set up the vector with 'cols' columns and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \note When passing a command queue instead of a device, the device
+    *       allocation is always performed, even if the device shares
+    *       memory with the host.
+    * \param cq Default command queue for operations copied from another mat
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  template <class mat_type>
+  inline int alloc(const size_t cols, mat_type &cq,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,cq,kind1,kind2); }
+
+  /// Set up host vector with 'cols' columns and reserve memory
+  /** The kind1 parameter controls memory pinning as follows:
+    * - UCL_NOT_PINNED      - Memory is not pinned
+    * - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
+    * - UCL_RW_OPTIMIZED    - Memory can be pinned
+    * The kind2 parameter controls memory optimizations as follows:
+    * - UCL_READ_WRITE - Specify that you will read and write in kernels
+    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
+    * - UCL_READ_ONLY  - Specify that you will only read in kernels
+    * \param acc Used to get the default command queue for operations
+    * \return UCL_SUCCESS if the memory allocation is successful **/
+  inline int alloc(const size_t cols, UCL_Device &acc,
+                   const enum UCL_MEMOPT kind1=UCL_RW_OPTIMIZED,
+                   const enum UCL_MEMOPT kind2=UCL_READ_WRITE)
+    { return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        alloc(host,device,_buffer,cols,acc,kind1,kind2); }
+
+  /// Free memory and set size to 0
+  inline void clear()
+    { host.clear(); device.clear(); }
+
+  /// Resize the allocation to contain cols elements
+  inline int resize(const int cols) {
+    assert(host.kind()!=UCL_VIEW);
+    int err=host.resize(cols);
+    if (err!=UCL_SUCCESS)
+      return err;
+    return _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+      dev_resize(device,host,_buffer,cols);
+  }
+
+  /// Resize (only if bigger) the allocation to contain cols elements
+  inline int resize_ib(const int new_cols)
+    { if (new_cols>cols()) return resize(new_cols); else return UCL_SUCCESS; }
+
+  /// Set each element to zero
+  inline void zero() { host.zero(); device.zero(); }
+
+  /// Set first n elements to zero
+  inline void zero(const int n) { host.zero(n); device.zero(n); }
+
+  /// Get the number of elements
+  inline size_t numel() const { return host.numel(); }
+  /// Get the number of rows
+  inline size_t rows() const { return host.rows(); }
+  /// Get the number of columns
+  inline size_t cols() const { return host.cols(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t host_mem_usage()
+    { return host.row_bytes()+_buffer.row_bytes(); }
+  /// Get the memory usage (bytes) of the s-object (including any buffers)
+  inline size_t device_mem_usage()
+    { return device.row_bytes(); }
+
+  /// Get element at index i
+  inline hosttype & operator[](const int i) { return host[i]; }
+  /// Get element at index i
+  inline const hosttype & operator[](const int i) const { return host[i]; }
+  /// 2D access (row should always be 0)
+  inline hosttype & operator()(const int row, const int col)
+    { return host[col]; }
+  /// 2D access (row should always be 0)
+  inline const hosttype & operator()(const int row, const int col) const
+    { return host[col]; }
+
+  /// Returns pointer to memory pointer for allocation on host
+  inline hosttype ** host_ptr() { return host.host_ptr(); }
+
+  /// Return the default command queue/stream associated with this data
+  inline command_queue & cq() { return host.cq(); }
+  /// Block until command_queue associated with vector is complete
+  inline void sync() { host.sync(); }
+
+  /// Get the size of a row on the host (including any padding) in elements
+  inline size_t row_size() const { return host.row_size(); }
+  /// Get the size of a row on the host (including any padding) in bytes
+  inline size_t row_bytes() const { return host.row_bytes(); }
+  /// Get the size on the host in bytes of 1 element
+  inline int element_size() const { return sizeof(hosttype); }
+
+  /// Update the allocation on the host asynchronously
+  inline void update_host()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,true); }
+  /// Update the allocation on the host (true for asynchronous copy)
+  inline void update_host(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,async); }
+  /// Update the allocation on the host (using command queue)
+  inline void update_host(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,_buffer,cq); }
+  /// Update the first n elements on the host (true for asynchronous copy)
+  inline void update_host(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,async); }
+  /// Update the first n elements on the host (using command queue)
+  inline void update_host(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,n,_buffer,cq); }
+  /// Update slice on the host (true for asynchronous copy)
+  inline void update_host(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,async); }
+  /// Update slice on the host (using command queue)
+  inline void update_host(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(host,device,rows,cols,_buffer,cq); }
+
+  /// Update the allocation on the device asynchronously
+  inline void update_device()
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,true); }
+  /// Update the allocation on the device (true for asynchronous copy)
+  inline void update_device(const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,async); }
+  /// Update the allocation on the device (using command queue)
+  inline void update_device(command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,_buffer,cq); }
+  /// Update the first n elements on the device (true for asynchronous copy)
+  inline void update_device(const int n, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,async); }
+  /// Update the first n elements on the device (using command queue)
+  inline void update_device(const int n, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,n,_buffer,cq); }
+  /// Update slice on the device (true for asynchronous copy)
+  inline void update_device(const int rows, const int cols, const bool async)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,async); }
+  /// Update slice on the device (using command queue)
+  inline void update_device(const int rows, const int cols, command_queue &cq)
+    { _ucl_s_obj_help< ucl_same_type<hosttype,devtype>::ans >::
+        copy(device,host,rows,cols,_buffer,cq); }
+
+ private:
+  UCL_H_Vec<devtype> _buffer;
+};
+
+#endif
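The grow-only `resize_ib` is aimed at per-step buffers whose required size fluctuates. A hypothetical usage pattern (sketch; `counts` and the fill logic are invented for illustration):

```cpp
// Hypothetical per-step pattern built on UCL_Vector (sketch; dev assumed
// to be an initialized UCL_Device, counts[] an assumed per-step size list):
void fill_steps(UCL_Device &dev, const int *counts, int nsteps) {
  UCL_Vector<float,float> buf;
  buf.alloc(1024,dev);
  for (int step=0; step<nsteps; step++) {
    buf.resize_ib(counts[step]);          // reallocates only when growing
    for (int i=0; i<counts[step]; i++)
      buf[i]=0.0f;                        // fill host side
    buf.update_device(counts[step],true); // async copy of first n elements
  }
}
```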
diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp
new file mode 100644
index 0000000000..7c090e6351
--- /dev/null
+++ b/lib/gpu/lal_base_dipole.cpp
@@ -0,0 +1,315 @@
+/***************************************************************************
+                               base_dipole.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  dipole, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include "lal_base_dipole.h"
+using namespace LAMMPS_AL;
+#define BaseDipoleT BaseDipole<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> global_device;
+
+template <class numtyp, class acctyp>
+BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
+}
+
+template <class numtyp, class acctyp>
+BaseDipoleT::~BaseDipole() {
+  delete ans;
+  delete nbor;
+}
+
+template <class numtyp, class acctyp>
+int BaseDipoleT::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BaseDipoleT::init_atomic(const int nlocal, const int nall,
+                             const int max_nbors, const int maxspecial,
+                             const double cell_size,
+                             const double gpu_split, FILE *_screen,
+                             const void *pair_program,
+                             const char *k_name) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_charge();
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
+                           maxspecial,_gpu_host,max_nbors,cell_size,false,
+                           _threads_per_atom);
+  if (success!=0)
+    return success;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
+  compile_kernels(*ucl_device,pair_program,k_name);
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->x,4);
+  q_tex.bind_float(atom->q,1);
+  mu_tex.bind_float(atom->quat,4);
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::estimate_gpu_overhead() {
+  device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::clear_atomic() {
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,
+                       screen);
+
+  if (_compiled) {
+    k_pair_fast.clear();
+    k_pair.clear();
+    delete pair_program;
+    _compiled=false;
+  }
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+  device->clear();
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
+                               int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return NULL;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, int *tag,
+                                         int **nspecial, int **special,
+                                         bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BaseDipoleT::compute(const int f_ago, const int inum_full,
+                          const int nall, double **host_x, int *host_type,
+                          int *ilist, int *numj, int **firstneigh,
+                          const bool eflag, const bool vflag,
+                          const bool eatom, const bool vatom,
+                          int &host_start, const double cpu_time,
+                          bool &success, double *host_q, double **host_mu,
+                          const int nlocal, double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  atom->cast_quat_data(host_mu[0]);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+  atom->add_quat_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseDipoleT::compute(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, int *tag,
+                           int **nspecial, int **special, const bool eflag,
+                           const bool vflag, const bool eatom,
+                           const bool vatom, int &host_start,
+                           int **ilist, int **jnum,
+                           const double cpu_time, bool &success,
+                           double *host_q, double **host_mu,
+                           double *boxlo, double *prd) {
+  acc_timers();
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return NULL;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, success);
+    if (!success)
+      return NULL;
+    atom->cast_q_data(host_q);
+    atom->cast_quat_data(host_mu[0]);
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    atom->cast_q_data(host_q);
+    atom->cast_quat_data(host_mu[0]);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  atom->add_q_data();
+  atom->add_quat_data();
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  loop(eflag,vflag);
+  ans->copy_answers(eflag,vflag,eatom,vatom);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double BaseDipoleT::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(BaseDipole<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *kname) {
+  if (_compiled)
+    return;
+
+  std::string s_fast=std::string(kname)+"_fast";
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE)+" -D"+
+                    std::string(OCL_VENDOR);
+
+  pair_program=new UCL_Program(dev);
+  pair_program->load_string(pair_str,flags.c_str());
+  k_pair_fast.set_function(*pair_program,s_fast.c_str());
+  k_pair.set_function(*pair_program,kname);
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  q_tex.get_texture(*pair_program,"q_tex");
+  mu_tex.get_texture(*pair_program,"mu_tex");
+
+  _compiled=true;
+}
+
+template class BaseDipole<PRECISION,ACC_PRECISION>;
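For orientation, a sketch of how a LAMMPS-side wrapper drives the device-neighboring `compute()` each timestep (argument names are placeholders; the real call sites live in the `*_ext.cpp` wrappers):

```cpp
// Hypothetical caller of the device-neighboring compute() (sketch only).
void step(BaseDipole<float,float> &lmp_gpu, int ago, int inum, int nall,
          double **x, int *type, double *sublo, double *subhi, int *tag,
          int **nspecial, int **special, double *q, double **mu,
          double *boxlo, double *prd) {
  int host_start, *ilist, *jnum;
  bool success;
  int **firstneigh=lmp_gpu.compute(ago,inum,nall,x,type,sublo,subhi,tag,
                                   nspecial,special,true,true,false,false,
                                   host_start,&ilist,&jnum,0.0,success,
                                   q,mu,boxlo,prd);
  // firstneigh / ilist / jnum then feed the CPU portion of the force loop
  (void)firstneigh;
}
```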
diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h
new file mode 100644
index 0000000000..51e357afda
--- /dev/null
+++ b/lib/gpu/lal_base_dipole.h
@@ -0,0 +1,200 @@
+/***************************************************************************
+                                base_dipole.h
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Base class for pair styles needing per-particle data for position,
+  dipole, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BASE_DIPOLE_H
+#define LAL_BASE_DIPOLE_H
+
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BaseDipole {
+ public:
+  BaseDipole();
+  virtual ~BaseDipole();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const void *pair_program, const char *k_name);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead();
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->x,4);
+      q_tex.bind_float(atom->q,1);
+      mu_tex.bind_float(atom->quat,4);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param max_nbors current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors,
+                           bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param inum number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to be copied
+    *        to host
+    * \param max_nbors current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note nlocal+host_inum=total number local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers();
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, int *tag,
+                       int **nspecial, int **special, bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               double **mu, const int nlocal, double *boxlo, double *prd);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, double *sublo,
+                double *subhi, int *tag, int **nspecial,
+                int **special, const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double **mu, double *boxlo, double *prd);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Device Properties and Atom and Neighbor storage
+  Device<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  Balance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> *atom;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  Neighbor *nbor;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_pair_fast, k_pair;
+  inline int block_size() { return _block_size; }
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture q_tex;
+  UCL_Texture mu_tex;
+
+ protected:
+  bool _compiled;
+  int _block_size, _block_bio_size, _threads_per_atom;
+  double _max_bytes, _max_an_bytes;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+                       const char *k);
+
+  virtual void loop(const bool _eflag, const bool _vflag) = 0;
+};
+
+}
+
+#endif
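Since `loop()` is the only pure virtual, a concrete style needs little beyond a kernel launch. A skeletal derived class (illustrative only; the real dipole styles in this library follow this shape but add coefficient storage, an `init()`, and kernel arguments):

```cpp
// Sketch of a concrete style on top of BaseDipole (not part of the patch).
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class DipoleExample : public BaseDipole<numtyp,acctyp> {
 private:
  void loop(const bool _eflag, const bool _vflag) {
    const int BX=this->block_size();
    this->time_pair.start();
    // set_size()/run() on this->k_pair or this->k_pair_fast goes here
    this->time_pair.stop();
    (void)BX; (void)_eflag; (void)_vflag;
  }
};
}
```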
diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp
new file mode 100644
index 0000000000..1665717c2d
--- /dev/null
+++ b/lib/gpu/lal_born.cpp
@@ -0,0 +1,162 @@
+/***************************************************************************
+                                   born.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_cl.h"
+#elif defined(USE_CUDART)
+const char *born=0;
+#else
+#include "born_cubin.h"
+#endif
+
+#include "lal_born.h"
+#include <math.h>
+using namespace LAMMPS_AL;
+#define BornT Born<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+BornT::Born() : BaseAtomic<numtyp,acctyp>(), _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+BornT::~Born() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int BornT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BornT::init(const int ntypes, double **host_cutsq,
+                double **host_rhoinv, double **host_born1,
+                double **host_born2, double **host_born3,
+                double **host_a, double **host_c,
+                double **host_d, double **host_sigma,
+                double **host_offset, double *host_special_lj,
+                const int nlocal, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size,
+                const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,born,"k_born");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+
+  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
+                         host_d,host_offset);
+
+  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
+                         host_sigma);
+
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _allocated=true;
+  this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+    +cutsq_sigma.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
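The `type_pack*` calls flatten the per-type coefficient tables into vec4/vec2 device vectors. A sketch of the assumed semantics (illustrative stand-in, not the library routine; LAMMPS type indices are 1-based and `host_write` is the staging buffer allocated above):

```cpp
// Assumed effect of type_pack4 for the coeff1 call (sketch only).
struct vec4 { double x,y,z,w; };
inline void pack4_demo(int ntypes, int lj_types, vec4 *out,
                       double **a, double **b, double **c, double **d) {
  for (int i=1; i<=ntypes; i++)
    for (int j=1; j<=ntypes; j++) {
      vec4 v={a[i][j],b[i][j],c[i][j],d[i][j]};
      out[i*lj_types+j]=v;            // row-major, 1-based type indices
    }
  // ...then one contiguous copy of the packed table to the device
}
```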
+
+template <class numtyp, class acctyp>
+void BornT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  coeff1.clear();
+  coeff2.clear();
+  cutsq_sigma.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double BornT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(Born<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BornT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2,
+                          &cutsq_sigma, &sp_lj,
+                          &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2,
+                     &cutsq_sigma, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class Born<PRECISION,ACC_PRECISION>;
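The grid sizing in `loop()` divides each block between threads that cooperate on one atom. A worked example of the arithmetic (values are illustrative):

```cpp
#include <math.h>
// With BX=128 threads per block and t_per_atom=4, each block covers
// BX/t_per_atom=32 atoms; 10000 atoms then need ceil(10000/32)=313 blocks.
static int grid_size(int inum, int BX, int t_per_atom) {
  return static_cast<int>(ceil(static_cast<double>(inum)/(BX/t_per_atom)));
}
```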
diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu
new file mode 100644
index 0000000000..3410a5effd
--- /dev/null
+++ b/lib/gpu/lal_born.cu
@@ -0,0 +1,201 @@
+// **************************************************************************
+//                                   born.cu
+//                             -------------------
+//                            Trung Dac Nguyen (ORNL)
+//
+//  Device code for acceleration of the born pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : nguyentd@ornl.gov
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+#else
+texture<int4,1> pos_tex;
+#endif
+#else
+#define pos_tex x_
+#endif
+
+__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
+                     __global numtyp4* coeff2,
+                     __global numtyp2 *cutsq_sigma,
+                     const int lj_types, __global numtyp *sp_lj_in,
+                     __global int *dev_nbor, __global int *dev_packed,
+                     __global acctyp4 *ans, __global acctyp *engv,
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp sp_lj[4];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<cutsq_sigma[mtype].x) {
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+                        + coeff1[mtype].w*r2inv*r6inv)*r2inv*factor_lj;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+            + coeff2[mtype].z*r2inv*r6inv;
+          energy+=factor_lj*(e-coeff2[mtype].w);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                  ans,engv);
+  } // if ii
+}
+
+__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
+                          __global numtyp4* coeff2_in,
+                          __global numtyp2 *cutsq_sigma,
+                          __global numtyp* sp_lj_in,
+                          __global int *dev_nbor, __global int *dev_packed,
+                          __global acctyp4 *ans, __global acctyp *engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[4];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    coeff1[tid]=coeff1_in[tid];
+    if (eflag>0)
+      coeff2[tid]=coeff2_in[tid];
+  }
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int iw=ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutsq_sigma[mtype].x) {
+        numtyp r2inv = ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+        numtyp rexp = ucl_exp((cutsq_sigma[mtype].y-r)*coeff1[mtype].x);
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+                        + coeff1[mtype].w*r2inv*r6inv)*r2inv*factor_lj;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+            + coeff2[mtype].z*r2inv*r6inv;
+          energy+=factor_lj*(e-coeff2[mtype].w);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                  ans,engv);
+  } // if ii
+}
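As a plain-C++ cross-check of what the kernels evaluate: with the usual LAMMPS definitions born1=A/rho, born2=6C and born3=8D, the pair energy and the scalar force factor reduce to the following (sketch; not library code):

```cpp
#include <math.h>
// E(r)   = A*exp((sigma-r)/rho) - C/r^6 + D/r^8 - offset
// f(r)/r = (born1*r*exp((sigma-r)/rho) - born2/r^6 + born3/r^8) / r^2
double born_pair(double rsq, double rhoinv, double born1, double born2,
                 double born3, double A, double C, double D,
                 double sigma, double offset, double *energy) {
  double r2inv=1.0/rsq, r6inv=r2inv*r2inv*r2inv, r=sqrt(rsq);
  double rexp=exp((sigma-r)*rhoinv);
  *energy=A*rexp - C*r6inv + D*r2inv*r6inv - offset;
  return (born1*r*rexp - born2*r6inv + born3*r2inv*r6inv)*r2inv;
}
```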
diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h
new file mode 100644
index 0000000000..fe9c2a1819
--- /dev/null
+++ b/lib/gpu/lal_born.h
@@ -0,0 +1,84 @@
+/***************************************************************************
+                                    born.h
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BORN_H
+#define LAL_BORN_H
+
+#include "lal_base_atomic.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class Born : public BaseAtomic<numtyp, acctyp> {
+ public:
+  Born();
+  ~Born();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * - 0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_rhoinv, double **host_born1, double **host_born2,
+           double **host_born3, double **host_a, double **host_c,
+           double **host_d, double **host_sigma,
+           double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
+  /// coeff1.w = born3
+  UCL_D_Vec<numtyp4> coeff1;
+  /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
+  UCL_D_Vec<numtyp4> coeff2;
+  /// cutsq_sigma.x = cutsq, cutsq_sigma.y = sigma
+  UCL_D_Vec<numtyp2> cutsq_sigma;
+  /// Special LJ values
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp
new file mode 100644
index 0000000000..157923d4fa
--- /dev/null
+++ b/lib/gpu/lal_born_coul_long.cpp
@@ -0,0 +1,175 @@
+/***************************************************************************
+                              born_coul_long.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the born/coul/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *born_coul_long=0;
+#else
+#include "born_coul_long_cubin.h"
+#endif
+
+#include "lal_born_coul_long.h"
+#include <math.h>
+using namespace LAMMPS_AL;
+#define BornCoulLongT BornCoulLong<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+BornCoulLongT::BornCoulLong() : BaseCharge<numtyp,acctyp>(),
+                                _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+BornCoulLongT::~BornCoulLong() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int BornCoulLongT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BornCoulLongT::init(const int ntypes, double **host_cutsq,
+                        double **host_rhoinv, double **host_born1,
+                        double **host_born2, double **host_born3,
+                        double **host_a, double **host_c, double **host_d,
+                        double **host_sigma, double **host_offset,
+                        double *host_special_lj, const int nlocal,
+                        const int nall, const int max_nbors,
+                        const int maxspecial, const double cell_size,
+                        const double gpu_split, FILE *_screen,
+                        double **host_cut_ljsq, const double host_cut_coulsq,
+                        double *host_special_coul, const double qqrd2e,
+                        const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,born_coul_long,"k_born_long");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+
+  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
+                         host_d,host_offset);
+
+  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
+                         host_cut_ljsq,host_sigma);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_coulsq=host_cut_coulsq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+
+  _allocated=true;
+  this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes()
+    +cutsq_sigma.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void BornCoulLongT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  coeff1.clear();
+  coeff2.clear();
+  cutsq_sigma.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double BornCoulLongT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(BornCoulLong<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
+                          &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->ans->force,
+                          &this->ans->engv, &eflag, &vflag,
+                          &ainum, &nbor_pitch, &this->atom->q,
+                          &cutsq_sigma, &_cut_coulsq, &_qqrd2e,
+                          &_g_ewald, &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
+                     &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q,
+                     &cutsq_sigma, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class BornCoulLong<PRECISION,ACC_PRECISION>;
(vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, + __global numtyp4* coeff2_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(_erfc-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_born_coul_long.h b/lib/gpu/lal_born_coul_long.h new file mode 100644 index 0000000000..4dc5021f03 --- /dev/null +++ b/lib/gpu/lal_born_coul_long.h @@ -0,0 +1,88 @@ +/*************************************************************************** + born_coul_long.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/long pair style. 
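
Each kernel in this file comes in a plain and a `_fast` variant; the `_fast` version above first stages the per-type-pair coefficient tables in `__local` (shared) memory, guarded by the thread id, so the neighbor loop reads its coefficients on-chip rather than from global memory. A minimal standalone CUDA restatement of that staging pattern follows; `MAX_SHARED_TYPES` and all other names here are assumptions for illustration, not library code:

```cpp
#include <cuda_runtime.h>

#define MAX_SHARED_TYPES 8  // assumed compile-time cap on cached atom types

__global__ void k_lookup_fast(const float4 *coeff_in, const int *pair_type,
                              float *out, int n) {
  // Stage the whole type-pair table once per block, one slot per thread,
  // mirroring the tid-guarded copies in k_born_long_fast above.
  // Assumes blockDim.x is at least the table size, which the library's
  // block-size check is meant to guarantee.
  __shared__ float4 coeff[MAX_SHARED_TYPES * MAX_SHARED_TYPES];
  if (threadIdx.x < MAX_SHARED_TYPES * MAX_SHARED_TYPES)
    coeff[threadIdx.x] = coeff_in[threadIdx.x];
  __syncthreads();  // no lookups until the whole table is resident

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = coeff[pair_type[i]].x;  // on-chip read in the hot path
}
```
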
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BORN_COUL_LONG_H +#define LAL_BORN_COUL_LONG_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template +class BornCoulLong : public BaseCharge { + public: + BornCoulLong(); + ~BornCoulLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2, + /// coeff1.w = born3 + UCL_D_Vec coeff1; + /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset + UCL_D_Vec coeff2; + /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj, + /// cutsq_sigma.z = sigma + UCL_D_Vec cutsq_sigma; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp new file mode 100644 index 0000000000..e8ac4eff0b --- /dev/null +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -0,0 +1,132 @@ +/*************************************************************************** + born_coul_long_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to born/coul/long acceleration routines. 
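
The coeff1/coeff2 layout documented in the header above flattens four per-type host tables into one vec4 per (i,j) type pair, so each kernel lookup is a single vectorized load. A hedged sketch of that flattening (the real work is done by the library's type_pack4 routine; the helper below and its indexing are illustrative assumptions):

```cpp
#include <vector>

struct vec4 { float x, y, z, w; };  // stand-in for the device-side numtyp4

// Entry (i,j) of four host tables becomes one vec4 at flat[i*lj_types+j],
// matching "coeff1.x = rhoinv, .y = born1, .z = born2, .w = born3" above.
// Assumes i and j already fit the padded lj_types range the library allocates.
std::vector<vec4> pack4(int lj_types, double **rhoinv, double **born1,
                        double **born2, double **born3) {
  std::vector<vec4> flat(lj_types * lj_types);
  for (int i = 0; i < lj_types; i++)
    for (int j = 0; j < lj_types; j++)
      flat[i * lj_types + j] = {(float)rhoinv[i][j], (float)born1[i][j],
                                (float)born2[i][j], (float)born3[i][j]};
  return flat;
}
```
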
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born_coul_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static BornCoulLong BORNCLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + BORNCLMF.clear(); + gpu_mode=BORNCLMF.device->gpu_mode(); + double gpu_split=BORNCLMF.device->particle_split(); + int first_gpu=BORNCLMF.device->first_device(); + int last_gpu=BORNCLMF.device->last_device(); + int world_me=BORNCLMF.device->world_me(); + int gpu_rank=BORNCLMF.device->gpu_rank(); + int procs_per_gpu=BORNCLMF.device->procs_per_gpu(); + + BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu); + + bool message=false; + if (BORNCLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, offset, + special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald); + + BORNCLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNCLMF.estimate_gpu_overhead(); + return init_ok; +} + +void borncl_gpu_clear() { + BORNCLMF.clear(); +} + +int** borncl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void borncl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + 
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double borncl_gpu_bytes() { + return BORNCLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp new file mode 100644 index 0000000000..936ce7095b --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -0,0 +1,176 @@ +/*************************************************************************** + born_coul_wolf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/wolf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "born_coul_wolf_cl.h" +#elif defined(USE_CUDART) +const char *born_coul_wolf=0; +#else +#include "born_coul_wolf_cubin.h" +#endif + +#include "lal_born_coul_wolf.h" +#include +using namespace LAMMPS_AL; +#define BornCoulWolfT BornCoulWolf + +extern Device device; + +template +BornCoulWolfT::BornCoulWolf() : BaseCharge(), + _allocated(false) { +} + +template +BornCoulWolfT::~BornCoulWolfT() { + clear(); +} + +template +int BornCoulWolfT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **host_sigma, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double alf, const double e_shift, const double f_shift) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,born_coul_wolf,"k_born_wolf"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_rhoinv, + host_born1,host_born2,host_born3); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c, + host_d,host_offset); + + cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq, + host_cut_ljsq,host_sigma); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _alf=alf; + _e_shift=e_shift; + _f_shift=f_shift; + + 
_allocated=true; + this->_max_bytes=coeff1.row_bytes()+coeff2.row_bytes() + +cutsq_sigma.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void BornCoulWolfT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff1.clear(); + coeff2.clear(); + cutsq_sigma.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double BornCoulWolfT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(BornCoulWolf); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &cutsq_sigma, &_cut_coulsq, &_qqrd2e, + &_alf, &_e_shift, &_f_shift, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &coeff1, &coeff2, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, + &cutsq_sigma, &_cut_coulsq, + &_qqrd2e, &_alf, &_e_shift, &_f_shift, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class BornCoulWolf; diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu new file mode 100644 index 0000000000..7461569db5 --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -0,0 +1,282 @@ +// ************************************************************************** +// born_coul_wolf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the born/coul/wolf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, + __global numtyp4* coeff2, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp alf, const numtyp e_shift, + const numtyp f_shift, const int t_per_atom) { + int tid, ii, offset; + 
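
Every loop() in this patch sizes its launch the same way: BX threads per block, t_per_atom threads cooperating on each atom, so a block covers BX/t_per_atom atoms and the grid is rounded up to cover all inum of them. A worked instance of that arithmetic (the values are examples only):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  int inum = 10000;   // atoms assigned to this device (example value)
  int BX = 128;       // block size, as returned by block_size()
  int t_per_atom = 4; // cooperating threads per atom
  int atoms_per_block = BX / t_per_atom;                      // 32
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 atoms_per_block));           // 313 blocks
  printf("launch %d blocks x %d threads\n", GX, BX);
  return 0;
}
```
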
atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + + for ( ; nbor0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(v_sh-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, + __global numtyp4* coeff2_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *cutsq_sigma, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp alf, const numtyp e_shift, + const numtyp f_shift, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + coeff2[tid]=coeff2_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + + for ( ; nbor0) { + if (rsq < cut_coulsq) + e_coul += prefactor*(v_sh-factor_coul); + if (rsq < coeff1[mtype].w) { + numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + + coeff2[mtype].z*r2inv*r6inv; + energy+=factor_lj*(e-coeff2[mtype].w); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h new file mode 100644 index 0000000000..9e02d23233 --- /dev/null +++ b/lib/gpu/lal_born_coul_wolf.h @@ -0,0 +1,89 @@ +/*************************************************************************** + born_coul_wolf.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the born/coul/wolf pair style. 
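
Unlike the Ewald-based long-range kernels, the Wolf kernels above add a per-atom self-energy before the neighbor loop. The division by t_per_atom in the kernel only compensates for each of the cooperating threads adding the same term ahead of the reduction, and the factor of two on e_self appears to offset the halving applied when pairwise energies are stored; both are omitted in this plain restatement (a sketch, not library code):

```cpp
// E_self(i) = -(e_shift/2 + alpha/sqrt(pi)) * q_i^2 * qqrd2e,
// accumulated once per atom i in k_born_wolf/k_born_wolf_fast above.
double wolf_self_energy(double qi, double qqrd2e, double alpha,
                        double e_shift) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi), as the kernels define
  return -(0.5 * e_shift + alpha / MY_PIS) * qi * qi * qqrd2e;
}
```
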
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_BORN_COUL_WOLF_H
+#define LAL_BORN_COUL_WOLF_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BornCoulWolf : public BaseCharge<numtyp, acctyp> {
+ public:
+  BornCoulWolf();
+  ~BornCoulWolf();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+           double **host_born1, double **host_born2, double **host_born3,
+           double **host_a, double **host_c, double **host_d,
+           double **host_sigma, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double alf, const double e_shift,
+           const double f_shift);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// coeff1.x = rhoinv, coeff1.y = born1, coeff1.z = born2,
+  /// coeff1.w = born3
+  UCL_D_Vec<numtyp4> coeff1;
+  /// coeff2.x = a, coeff2.y = c, coeff2.z = d, coeff2.w = offset
+  UCL_D_Vec<numtyp4> coeff2;
+  /// cutsq_sigma.x = cutsq, cutsq_sigma.y = cutsq_lj,
+  /// cutsq_sigma.z = sigma
+  UCL_D_Vec<numtyp4> cutsq_sigma;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;
+
+ private:
+  bool _allocated;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp
new file mode 100644
index 0000000000..3e779d099e
--- /dev/null
+++ b/lib/gpu/lal_born_coul_wolf_ext.cpp
@@ -0,0 +1,134 @@
+/***************************************************************************
+                            born_coul_wolf_ext.cpp
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to born/coul/wolf acceleration routines.
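
As the sp_lj comment above notes, the four special-bond LJ factors and four Coulomb factors travel to the device in one eight-element vector. The kernels recover them from the high bits of each packed neighbor index; in the long-range kernels the stored Coulomb quantity is effectively 1 - special, so the correction can be subtracted inside the Ewald term, as the `_erfc-factor_coul` expression earlier suggests. A sketch of the unpacking convention (the SBBITS/NEIGHMASK values are assumptions matching common LAMMPS builds):

```cpp
#define SBBITS 30            // assumed: top two bits carry the 1-2/1-3/1-4 flag
#define NEIGHMASK 0x3FFFFFFF // assumed: low 30 bits hold the neighbor index

inline int sbmask(int j) { return j >> SBBITS & 3; }

// factor_lj scales the Born/LJ part, factor_coul the Coulomb part.
void unpack_special(int packed_j, const double sp_lj[8], double &factor_lj,
                    double &factor_coul, int &j) {
  factor_lj   = sp_lj[sbmask(packed_j)];
  factor_coul = sp_lj[sbmask(packed_j) + 4];
  j = packed_j & NEIGHMASK;
}
```
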
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born_coul_wolf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static BornCoulWolf BORNCWMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, double **host_born3, + double **host_a, double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double alf, const double e_shift, const double f_shift) { + BORNCWMF.clear(); + gpu_mode=BORNCWMF.device->gpu_mode(); + double gpu_split=BORNCWMF.device->particle_split(); + int first_gpu=BORNCWMF.device->first_device(); + int last_gpu=BORNCWMF.device->last_device(); + int world_me=BORNCWMF.device->world_me(); + int gpu_rank=BORNCWMF.device->gpu_rank(); + int procs_per_gpu=BORNCWMF.device->procs_per_gpu(); + + BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu); + + bool message=false; + if (BORNCWMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, + alf, e_shift, f_shift); + + BORNCWMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNCWMF.estimate_gpu_overhead(); + return init_ok; +} + +void borncw_gpu_clear() { + BORNCWMF.clear(); +} + +int** borncw_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void borncw_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + 
BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double borncw_gpu_bytes() { + return BORNCWMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp new file mode 100644 index 0000000000..7785353a8a --- /dev/null +++ b/lib/gpu/lal_born_ext.cpp @@ -0,0 +1,124 @@ +/*************************************************************************** + born_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to born acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_born.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Born BORNMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + BORNMF.clear(); + gpu_mode=BORNMF.device->gpu_mode(); + double gpu_split=BORNMF.device->particle_split(); + int first_gpu=BORNMF.device->first_device(); + int last_gpu=BORNMF.device->last_device(); + int world_me=BORNMF.device->world_me(); + int gpu_rank=BORNMF.device->gpu_rank(); + int procs_per_gpu=BORNMF.device->procs_per_gpu(); + + BORNMF.device->init_message(screen,"born",first_gpu,last_gpu); + + bool message=false; + if (BORNMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, + host_born3, host_a, host_c, host_d, sigma, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + BORNMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + BORNMF.estimate_gpu_overhead(); + return init_ok; +} + +void born_gpu_clear() { + BORNMF.clear(); +} + +int ** born_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void born_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int 
**firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double born_gpu_bytes() { + return BORNMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp new file mode 100644 index 0000000000..cb6d23fc31 --- /dev/null +++ b/lib/gpu/lal_colloid.cpp @@ -0,0 +1,181 @@ +/*************************************************************************** + colloid.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the colloid pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "colloid_cl.h" +#elif defined(USE_CUDART) +const char *colloid=0; +#else +#include "colloid_cubin.h" +#endif + +#include "lal_colloid.h" +#include +using namespace LAMMPS_AL; +#define ColloidT Colloid + +extern Device device; + +template +ColloidT::Colloid() : BaseAtomic(), _allocated(false) { +} + +template +ColloidT::~Colloid() { + clear(); +} + +template +int ColloidT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int ColloidT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, double **host_a12, + double **host_a1, double **host_a2, + double **host_d1, double **host_d2, + double **host_sigma3, double **host_sigma6, + int **host_form, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,colloid,"k_colloid"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + colloid1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,colloid1,host_write,host_a12,host_a1, + host_a2); + + colloid2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,colloid2,host_write,host_d1,host_d2, + host_sigma3,host_sigma6); + + UCL_H_Vec dview_form(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + for (int i=0; iucl_device),UCL_READ_ONLY); + for (int i=0; i dview; + 
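
The integer `form` table staged just above selects, per type pair, which branch of the colloid kernels in the device file that follows is taken. The values line up with the three interaction cases of the CPU colloid pair style; the enum below restates that mapping as a sketch, with names inferred rather than taken from this patch:

```cpp
// Mapping inferred from the "form[mtype]==0/1/2" branches in k_colloid:
enum ColloidForm {
  SMALL_SMALL = 0,  // both particles point-like: plain 12-6 LJ term
  SMALL_LARGE = 1,  // point particle vs. finite sphere: fR-based branch
  LARGE_LARGE = 2   // two finite spheres: Hamaker-integrated branch
};
```
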
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes() + +colloid1.row_bytes()+colloid2.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void ColloidT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + colloid1.clear(); + colloid2.clear(); + form.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double ColloidT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Colloid); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void ColloidT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &colloid1, &colloid2, &form, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class Colloid; diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu new file mode 100644 index 0000000000..74e249083d --- /dev/null +++ b/lib/gpu/lal_colloid.cu @@ -0,0 +1,329 @@ +// ************************************************************************** +// colloid.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the colloid pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +#else +texture pos_tex; +#endif +#else +#define pos_tex x_ +#endif + +__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, + __global numtyp4* colloid1, + __global numtyp4* colloid2, + __global int *form, + __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; 
f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=(numtyp)0.0; + if (form[mtype]==0) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + } else if (form[mtype]==1) { + e=(numtyp)2.0/(numtyp)9.0*fR * + ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) + + (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]); + } else if (form[mtype]==2) { + e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); + } + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_colloid_fast(__global numtyp4 *x_, + __global numtyp4 *lj1_in, + __global numtyp4 *lj3_in, + __global numtyp *sp_lj_in, + __global numtyp4 *colloid1_in, + __global numtyp4 *colloid2_in, + __global int *form_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 colloid1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + numtyp e=(numtyp)0.0; + if (form[mtype]==0) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + } else if (form[mtype]==1) { + e=(numtyp)2.0/(numtyp)9.0*fR * + ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])* + colloid2[mtype].w/K[6]); + } else if (form[mtype]==2) { + e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])); + } + energy+=factor_lj*(e-lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h new file mode 100644 index 0000000000..416beabcdf --- /dev/null +++ b/lib/gpu/lal_colloid.h @@ -0,0 +1,89 @@ +/*************************************************************************** + colloid.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the colloid pair style. 
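
In every energy branch above, the tabulated offset (`lj3.z` here, `coeff2.w` in the Born kernels) is subtracted so the pair energy vanishes at the cutoff. For the plain 12-6 branch that offset is simply the potential evaluated at r = rc; a one-function sketch, not library code:

```cpp
#include <cmath>

// U_LJ(rc): subtracting this from U_LJ(r) makes the shifted energy zero
// at the cutoff, matching "energy += factor_lj*(e - offset)" in the kernels.
double lj_offset(double epsilon, double sigma, double rc) {
  double sr6 = std::pow(sigma / rc, 6.0);
  return 4.0 * epsilon * (sr6 * sr6 - sr6);
}
```
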
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_COLLOID_H +#define LAL_COLLOID_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template +class Colloid : public BaseAtomic { + public: + Colloid(); + ~Colloid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// colloid1.x = a12, colloid1.y = a1, colloid1.z = a2 + UCL_D_Vec colloid1; + /// colloid2.x = d1, colloid2.y = d2, colloid2.z = sigma3, + /// colloid2.w = sigma6 + UCL_D_Vec colloid2; + /// form + UCL_D_Vec form; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp new file mode 100644 index 0000000000..d4e8a2092b --- /dev/null +++ b/lib/gpu/lal_colloid_ext.cpp @@ -0,0 +1,127 @@ +/*************************************************************************** + colloid_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to colloid acceleration routines. 
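
All of the `*_gpu_init()` wrappers in this patch (born, born/coul/long, born/coul/wolf, and the colloid one that follows) share the same staged start-up: process 0 initializes first so kernel compilation happens once, then the remaining ranks take their turn one GPU rank at a time. A condensed sketch of that control flow; the helper names are stand-ins for the library's device methods, not real API:

```cpp
int do_init();        // stands in for STYLE.init(...); returns 0 on success
void world_barrier(); // stands in for device->world_barrier()
void gpu_barrier();   // stands in for device->gpu_barrier()

int staged_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok = 0;
  if (world_me == 0)          // rank 0 compiles/initializes first
    init_ok = do_init();
  world_barrier();            // everyone waits for the cached build
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();    // then one rank per GPU at a time
    gpu_barrier();
  }
  return init_ok;             // negative codes map to the init() docs above
}
```
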
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_colloid.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Colloid COLLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, + double **host_a12, double **host_a1, double **host_a2, + double **host_d1, double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + COLLMF.clear(); + gpu_mode=COLLMF.device->gpu_mode(); + double gpu_split=COLLMF.device->particle_split(); + int first_gpu=COLLMF.device->first_device(); + int last_gpu=COLLMF.device->last_device(); + int world_me=COLLMF.device->world_me(); + int gpu_rank=COLLMF.device->gpu_rank(); + int procs_per_gpu=COLLMF.device->procs_per_gpu(); + + COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu); + + bool message=false; + if (COLLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, host_a12, host_a1, + host_a2, host_d1, host_d2, host_sigma3, + host_sigma6, host_form, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + COLLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + COLLMF.estimate_gpu_overhead(); + return init_ok; +} + +void colloid_gpu_clear() { + COLLMF.clear(); +} + +int ** colloid_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void colloid_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double colloid_gpu_bytes() { + return COLLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp new file mode 100644 index 0000000000..e597a325ff --- 
/dev/null +++ b/lib/gpu/lal_coul_dsf.cpp @@ -0,0 +1,153 @@ +/*************************************************************************** + coul_dsf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the coul/dsf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 8/15/2012 + email : nguyentdw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "coul_dsf_cl.h" +#elif defined(USE_CUDART) +const char *coul_dsf=0; +#else +#include "coul_dsf_cubin.h" +#endif + +#include "lal_coul_dsf.h" +#include +using namespace LAMMPS_AL; +#define CoulDSFT CoulDSF + +extern Device device; + +template +CoulDSFT::CoulDSF() : BaseCharge(), + _allocated(false) { +} + +template +CoulDSFT::~CoulDSF() { + clear(); +} + +template +int CoulDSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CoulDSFT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,coul_dsf,"k_coul_dsf"); + if (success!=0) + return success; + + _cut_coulsq=host_cut_coulsq; + _e_shift=e_shift; + _f_shift=f_shift; + _alpha=alpha; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,4,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=sp_lj.row_bytes(); + return 0; +} + +template +void CoulDSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + sp_lj.clear(); + this->clear_atomic(); +} + +template +double CoulDSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CoulDSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void CoulDSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, 
&this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class CoulDSF; diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu new file mode 100644 index 0000000000..3ce1eccab3 --- /dev/null +++ b/lib/gpu/lal_coul_dsf.cu @@ -0,0 +1,214 @@ +// ************************************************************************** +// coul_dsf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the coul/dsf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : 8/15/2012 +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + 
virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+      if (eflag>0) {
+        if (rsq < cut_coulsq) {
+          numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift);
+          e_coul += e;
+        }
+      }
+      if (vflag>0) {
+        virial[0] += delx*delx*force;
+        virial[1] += dely*dely*force;
+        virial[2] += delz*delz*force;
+        virial[3] += delx*dely*force;
+        virial[4] += delx*delz*force;
+        virial[5] += dely*delz*force;
+      }
+    }
+
+  } // for nbor
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
+  } // if ii
+}
+
diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h
new file mode 100644
index 0000000000..0579ab46d5
--- /dev/null
+++ b/lib/gpu/lal_coul_dsf.h
@@ -0,0 +1,78 @@
+/***************************************************************************
+                                 coul_dsf.h
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the coul/dsf pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : 8/15/2012
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_COUL_DSF_H
+#define LAL_COUL_DSF_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class CoulDSF : public BaseCharge<numtyp, acctyp> {
+ public:
+  CoulDSF();
+  ~CoulDSF();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const int nlocal, const int nall,
+           const int max_nbors, const int maxspecial,
+           const double cell_size, const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double e_shift, const double f_shift,
+           const double alpha);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// Special Coul values [0-3]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e;
+
+ private:
+  bool _allocated;
+  numtyp _e_shift, _f_shift, _alpha, _cut_coulsq;
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp
new file mode 100644
index 0000000000..19879cb9c5
--- /dev/null
+++ b/lib/gpu/lal_coul_dsf_ext.cpp
@@ -0,0 +1,125 @@
+/***************************************************************************
+                               coul_dsf_ext.cpp
+                             -------------------
+                         Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to coul/dsf acceleration routines.
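
The `e_shift` and `f_shift` constants consumed by the coul/dsf kernels above are precomputed on the host from the damping parameter alpha and the Coulomb cutoff, following the damped-shifted-force form of Fennell and Gezelter. A sketch of that precomputation, mirroring the CPU pair style rather than code in this patch:

```cpp
#include <cmath>

// Damped-shifted-force constants: f_shift matches the damped force at the
// cutoff and e_shift the corresponding energy, so both the pair energy
// prefactor*(erfcc - r*e_shift - rsq*f_shift) and the force go smoothly
// to zero at r = rc.
void dsf_shifts(double alpha, double rc, double &e_shift, double &f_shift) {
  const double MY_PIS = 1.77245385090551602729;  // sqrt(pi)
  double erfcc = std::erfc(alpha * rc);
  double erfcd = std::exp(-alpha * alpha * rc * rc);
  f_shift = -(erfcc / (rc * rc) + 2.0 * alpha / MY_PIS * erfcd / rc);
  e_shift = erfcc / rc - f_shift * rc;
}
```
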
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 8/15/2012 + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_coul_dsf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CoulDSF CDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int cdsf_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha) { + CDMF.clear(); + gpu_mode=CDMF.device->gpu_mode(); + double gpu_split=CDMF.device->particle_split(); + int first_gpu=CDMF.device->first_device(); + int last_gpu=CDMF.device->last_device(); + int world_me=CDMF.device->world_me(); + int gpu_rank=CDMF.device->gpu_rank(); + int procs_per_gpu=CDMF.device->procs_per_gpu(); + + CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu); + + bool message=false; + if (CDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_coulsq, host_special_coul, + qqrd2e, e_shift, f_shift, alpha); + + CDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CDMF.estimate_gpu_overhead(); + return init_ok; +} + +void cdsf_gpu_clear() { + CDMF.clear(); +} + +int** cdsf_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double cdsf_gpu_bytes() { + return CDMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp new file mode 100644 index 0000000000..41ec84648d --- /dev/null +++ b/lib/gpu/lal_dipole_lj.cpp @@ -0,0 +1,170 @@ 
+/*************************************************************************** + dipole_lj.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "dipole_lj_cl.h" +#elif defined(USE_CUDART) +const char *dipole_lj=0; +#else +#include "dipole_lj_cubin.h" +#endif + +#include "lal_dipole_lj.h" +#include +using namespace LAMMPS_AL; +#define DipoleLJT DipoleLJ + +extern Device device; + +template +DipoleLJT::DipoleLJ() : BaseDipole(), + _allocated(false) { +} + +template +DipoleLJT::~DipoleLJ() { + clear(); +} + +template +int DipoleLJT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int DipoleLJT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,dipole_lj,"k_dipole_lj"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void DipoleLJT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double DipoleLJT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(DipoleLJ); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void DipoleLJT::loop(const bool _eflag, const bool _vflag) { + // Compute the 
block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class DipoleLJ; diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu new file mode 100644 index 0000000000..6ab0b3037d --- /dev/null +++ b/lib/gpu/lal_dipole_lj.cu @@ -0,0 +1,501 @@ +// ************************************************************************** +// dipole_lj.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the dipole/cut pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" + +#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[8][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + red_acc[7][tid]=ecoul; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<8; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + ecoul=red_acc[7][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1, + 
__global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp4 *mu_, + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + r7inv = r5inv*r2inv; + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; + pre2 = (numtyp)3.0*r5inv*pjdotr; + pre3 = (numtyp)3.0*r5inv*pidotr; + pre4 = (numtyp)(-1.0)*r3inv; + + forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; + forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; + forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr; + pre2 = qj*r3inv; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; + pre2 = qtmp*r3inv; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + e = qtmp*qj*rinv; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + 
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *mu_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + r7inv = r5inv*r2inv; + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*r5inv*pdotp - (numtyp)15.0*r7inv*pidotr*pjdotr; + pre2 = (numtyp)3.0*r5inv*pjdotr; + pre3 = (numtyp)3.0*r5inv*pidotr; + pre4 = (numtyp)(-1.0)*r3inv; + + forcecoul.x += pre1*delx + pre2*mui.x + pre3*muj.x; + forcecoul.y += pre1*dely + pre2*mui.y + pre3*muj.y; + forcecoul.z += pre1*delz + pre2*mui.z + pre3*muj.z; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr; + pre2 = qj*r3inv; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr; + pre2 = qtmp*r3inv; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + 
delz*force_lj; + + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0; + if (rsq < lj1[mtype].w) { + e = qtmp*qj*rinv; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr; + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr; + e *= fq; + } else e = (acctyp)0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h new file mode 100644 index 0000000000..b08b7a8669 --- /dev/null +++ b/lib/gpu/lal_dipole_lj.h @@ -0,0 +1,85 @@ +/*************************************************************************** + dipole_lj.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/cut pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_DIPOLE_LJ_H +#define LAL_DIPOLE_LJ_H + +#include "lal_base_dipole.h" + +namespace LAMMPS_AL { + +template +class DipoleLJ : public BaseDipole { + public: + DipoleLJ(); + ~DipoleLJ(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int 
_lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp new file mode 100644 index 0000000000..05d8fd9f72 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + dipole_lj_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to dipole/cut acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_dipole_lj.h" + +using namespace std; +using namespace LAMMPS_AL; + +static DipoleLJ DPLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + DPLMF.clear(); + gpu_mode=DPLMF.device->gpu_mode(); + double gpu_split=DPLMF.device->particle_split(); + int first_gpu=DPLMF.device->first_device(); + int last_gpu=DPLMF.device->last_device(); + int world_me=DPLMF.device->world_me(); + int gpu_rank=DPLMF.device->gpu_rank(); + int procs_per_gpu=DPLMF.device->procs_per_gpu(); + + DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu); + + bool message=false; + if (DPLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + DPLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + DPLMF.estimate_gpu_overhead(); + return init_ok; +} + +void dpl_gpu_clear() { + DPLMF.clear(); +} + +int** dpl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double **host_mu, + double *boxlo, double *prd) { + return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, host_mu, boxlo, prd); +} + +void dpl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, 
int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double *boxlo, double *prd) { + DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, + nlocal,boxlo,prd); +} + +double dpl_gpu_bytes() { + return DPLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp new file mode 100644 index 0000000000..4e962dc8c7 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -0,0 +1,170 @@ +/*************************************************************************** + dipole_lj_sf.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/sf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "dipole_lj_sf_cl.h" +#elif defined(USE_CUDART) +const char *dipole_lj_sf=0; +#else +#include "dipole_lj_sf_cubin.h" +#endif + +#include "lal_dipole_lj_sf.h" +#include +using namespace LAMMPS_AL; +#define DipoleLJSFT DipoleLJSF + +extern Device device; + +template +DipoleLJSFT::DipoleLJSF() : BaseDipole(), + _allocated(false) { +} + +template +DipoleLJSFT::~DipoleLJSF() { + clear(); +} + +template +int DipoleLJSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int DipoleLJSFT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,dipole_lj_sf,"k_dipole_lj_sf"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_cutsq); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + 
_allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void DipoleLJSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double DipoleLJSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(DipoleLJSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, + &this->atom->quat, &cutsq, + &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class DipoleLJSF; diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu new file mode 100644 index 0000000000..16d5463cb1 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -0,0 +1,562 @@ +// ************************************************************************** +// dipole_lj_sf.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the dipole/sf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" + +#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[8][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + red_acc[7][tid]=ecoul; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<8; r++) 
\ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + ecoul=red_acc[7][tid]; \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp4 *mu_, + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; + pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); + aforcecoul.x = pre1*delx; + aforcecoul.y = pre1*dely; + aforcecoul.z = pre1*delz; + + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ + (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; + presf = (numtyp)2.0*r2inv*pidotr*pjdotr; + bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); + bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); + bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); + + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); + forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); + forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); + + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; + pre4 = -bfac*r3inv; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + rcutcoul2inv=ucl_recip(lj1[mtype].w); + pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qj*r3inv * pqfac; + + forcecoul.x += pre2*mui.x - pre1*delx; + 
forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + rcutcoul2inv=ucl_recip(lj1[mtype].w); + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qtmp*r3inv * qpfac; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); + e = qtmp*qj*rinv*fac*fac; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr * pqfac; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr * qpfac; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + (numtyp)4.0*lj3[mtype].y); + energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp4 *mu_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp4 tor; + tor.x=(acctyp)0; + tor.y=(acctyp)0; + tor.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii (numtyp)0.0 && muj.w > (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + + pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + 
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + afac = (numtyp)1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv; + pre1 = afac * (pdotp - (numtyp)3.0*r2inv*pidotr*pjdotr); + aforcecoul.x = pre1*delx; + aforcecoul.y = pre1*dely; + aforcecoul.z = pre1*delz; + + bfac = (numtyp)1.0-(numtyp)4.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv)+ + (numtyp)3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv; + presf = (numtyp)2.0*r2inv*pidotr*pjdotr; + bforcecoul.x = bfac * (pjdotr*mui.x+pidotr*muj.x-presf*delx); + bforcecoul.y = bfac * (pjdotr*mui.y+pidotr*muj.y-presf*dely); + bforcecoul.z = bfac * (pjdotr*mui.z+pidotr*muj.z-presf*delz); + + forcecoul.x += (numtyp)3.0*r5inv*(aforcecoul.x + bforcecoul.x); + forcecoul.y += (numtyp)3.0*r5inv*(aforcecoul.y + bforcecoul.y); + forcecoul.z += (numtyp)3.0*r5inv*(aforcecoul.z + bforcecoul.z); + + pre2 = (numtyp)3.0*bfac*r5inv*pjdotr; + pre4 = -bfac*r3inv; + + numtyp crossx = pre4 * (mui.y*muj.z - mui.z*muj.y); + numtyp crossy = pre4 * (mui.z*muj.x - mui.x*muj.z); + numtyp crossz = pre4 * (mui.x*muj.y - mui.y*muj.x); + + ticoul.x += crossx + pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += crossy + pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += crossz + pre2 * (mui.x*dely - mui.y*delx); + } + + // dipole-charge + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pidotr = mui.x*delx + mui.y*dely + mui.z*delz; + pre1 = (numtyp)3.0*qj*r5inv * pidotr*((numtyp)1.0-rsq*rcutcoul2inv); + pqfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qj*r3inv * pqfac; + + forcecoul.x += pre2*mui.x - pre1*delx; + forcecoul.y += pre2*mui.y - pre1*dely; + forcecoul.z += pre2*mui.z - pre1*delz; + ticoul.x += pre2 * (mui.y*delz - mui.z*dely); + ticoul.y += pre2 * (mui.z*delx - mui.x*delz); + ticoul.z += pre2 * (mui.x*dely - mui.y*delx); + } + + // charge-dipole + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) { + r3inv = r2inv*rinv; + r5inv = r3inv*r2inv; + pjdotr = muj.x*delx + muj.y*dely + muj.z*delz; + + pre1 = (numtyp)3.0*qtmp*r5inv * pjdotr*((numtyp)1.0-rsq*rcutcoul2inv); + qpfac = (numtyp)1.0 - (numtyp)3.0*rsq*rcutcoul2inv + + (numtyp)2.0*rsq*ucl_sqrt(rsq)*rcutcoul2inv*ucl_sqrt(rcutcoul2inv); + pre2 = qtmp*r3inv * qpfac; + + forcecoul.x += pre1*delx - pre2*muj.x; + forcecoul.y += pre1*dely - pre2*muj.y; + forcecoul.z += pre1*delz - pre2*muj.z; + } + } else { + forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0; + ticoul.x = ticoul.y = ticoul.z = (acctyp)0; + } + + numtyp fq = factor_coul*qqrd2e; + force.x = fq*forcecoul.x + delx*force_lj; + force.y = fq*forcecoul.y + dely*force_lj; + force.z = fq*forcecoul.z + delz*force_lj; + f.x+=force.x; + f.y+=force.y; + f.z+=force.z; + tor.x+=fq*ticoul.x; + tor.y+=fq*ticoul.y; + tor.z+=fq*ticoul.z; + + if (eflag>0) { + acctyp e = (acctyp)0.0; + if (rsq < lj1[mtype].w) { + numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); + e = qtmp*qj*rinv*fac*fac; + if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) + e += bfac* (r3inv*pdotp - (numtyp)3.0*r5inv*pidotr*pjdotr); + if (mui.w > (numtyp)0.0 && qj != (numtyp)0.0) + e += -qj*r3inv*pidotr * pqfac; + if (muj.w > (numtyp)0.0 && qtmp != (numtyp)0.0) + e += qtmp*r3inv*pjdotr * qpfac; + e *= fq; + } else e = (acctyp)0.0; + e_coul += e; + + if (rsq < lj1[mtype].z) { + e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y) + + rcutlj6inv*((numtyp)6.0*lj3[mtype].x*rcutlj6inv - + (numtyp)3.0*lj3[mtype].y)*rsq*rcutlj2inv + + rcutlj6inv*((numtyp)(-7.0)*lj3[mtype].x*rcutlj6inv + + (numtyp)4.0*lj3[mtype].y); 
+ energy+=factor_lj*e; + } + } + if (vflag>0) { + virial[0] += delx*force.x; + virial[1] += dely*force.y; + virial[2] += delz*force.z; + virial[3] += delx*force.y; + virial[4] += delx*force.z; + virial[5] += dely*force.z; + } + } + + } // for nbor + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h new file mode 100644 index 0000000000..83cea4c2a4 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -0,0 +1,85 @@ +/*************************************************************************** + dipole_lj_sf.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the dipole/sf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_DIPOLE_LJ_SF_H +#define LAL_DIPOLE_LJ_SF_H + +#include "lal_base_dipole.h" + +namespace LAMMPS_AL { + +template +class DipoleLJSF : public BaseDipole { + public: + DipoleLJSF(); + ~DipoleLJSF(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = cutsq + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp new file mode 100644 index 0000000000..53ef66fca6 --- /dev/null +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + dipole_lj_sf_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to dipole/sf acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_dipole_lj_sf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static DipoleLJSF DPLSFMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + DPLSFMF.clear(); + gpu_mode=DPLSFMF.device->gpu_mode(); + double gpu_split=DPLSFMF.device->particle_split(); + int first_gpu=DPLSFMF.device->first_device(); + int last_gpu=DPLSFMF.device->last_device(); + int world_me=DPLSFMF.device->world_me(); + int gpu_rank=DPLSFMF.device->gpu_rank(); + int procs_per_gpu=DPLSFMF.device->procs_per_gpu(); + + DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu); + + bool message=false; + if (DPLSFMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + DPLSFMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + DPLSFMF.estimate_gpu_overhead(); + return init_ok; +} + +void dplsf_gpu_clear() { + DPLSFMF.clear(); +} + +int** dplsf_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double **host_mu, + double *boxlo, double *prd) { + return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, host_mu, boxlo, prd); +} + +void dplsf_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double *boxlo, double *prd) { + DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, + nlocal,boxlo,prd); +} + +double dplsf_gpu_bytes() { + return DPLSFMF.host_memory_usage(); +} + + 
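The dipole kernels above map directly onto the textbook point-dipole interaction. A compact host-side transcription of the dipole-dipole branch of k_dipole_lj, with the same pre1..pre4 factors (the fq = factor_coul*qqrd2e scaling is applied by the caller, as in the kernel; the vec3 type and function name are illustrative):

#include <cmath>

struct vec3 { double x, y, z; };

static vec3 cross(const vec3 &a, const vec3 &b) {
  return {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x};
}

// Force on atom i and its torque contribution from one dipole-dipole pair;
// del = r_i - r_j, mui/muj are the point dipoles (unshifted dipole/cut form).
void dipole_dipole(const vec3 &del, const vec3 &mui, const vec3 &muj,
                   vec3 &force, vec3 &ti) {
  const double rsq = del.x*del.x + del.y*del.y + del.z*del.z;
  const double r2inv = 1.0/rsq, rinv = std::sqrt(r2inv);
  const double r3inv = r2inv*rinv, r5inv = r3inv*r2inv, r7inv = r5inv*r2inv;
  const double pdotp  = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
  const double pidotr = mui.x*del.x + mui.y*del.y + mui.z*del.z;
  const double pjdotr = muj.x*del.x + muj.y*del.y + muj.z*del.z;
  const double pre1 = 3.0*r5inv*pdotp - 15.0*r7inv*pidotr*pjdotr;
  const double pre2 = 3.0*r5inv*pjdotr;
  const double pre3 = 3.0*r5inv*pidotr;
  const double pre4 = -r3inv;
  force = {pre1*del.x + pre2*mui.x + pre3*muj.x,
           pre1*del.y + pre2*mui.y + pre3*muj.y,
           pre1*del.z + pre2*mui.z + pre3*muj.z};
  const vec3 c1 = cross(mui, muj), c2 = cross(mui, del);
  ti = {pre4*c1.x + pre2*c2.x, pre4*c1.y + pre2*c2.y, pre4*c1.z + pre2*c2.z};
}

The dipole/sf variant wraps the same dot products in the afac/bfac shifted-force factors so that forces, torques, and energies all go smoothly to zero at the Coulomb cutoff.

Likewise, the Lennard-Jones part of dipole/sf uses the Stoddard-Ford shift rather than a plain truncation, and the energy expression in k_dipole_lj_sf can be checked in isolation. A minimal self-contained sketch (lj3 = 4*eps*sigma^12, lj4 = 4*eps*sigma^6, as packed by init(); names illustrative):

#include <cmath>
#include <cstdio>

double ljsf_energy(double rsq, double lj3, double lj4, double cutsq) {
  const double r6inv = 1.0/(rsq*rsq*rsq);
  const double rc2inv = 1.0/cutsq;
  const double rc6inv = rc2inv*rc2inv*rc2inv;
  return r6inv*(lj3*r6inv - lj4)
       + rc6inv*(6.0*lj3*rc6inv - 3.0*lj4)*rsq*rc2inv
       + rc6inv*(-7.0*lj3*rc6inv + 4.0*lj4);
}

int main() {
  const double eps = 1.0, sigma = 1.0, rc = 2.5;
  const double lj3 = 4.0*eps*std::pow(sigma, 12);
  const double lj4 = 4.0*eps*std::pow(sigma, 6);
  // Both the energy and its slope vanish at the cutoff by construction:
  std::printf("E(rc) = %g\n", ljsf_energy(rc*rc, lj3, lj4, rc*rc));  // ~0
  return 0;
}

Finally, the store_answers_tq macro folds the per-thread partial forces, torques, energies, and virials with a strided reduction: the t_per_atom threads assigned to one atom halve the stride each step,

for (unsigned int s = t_per_atom/2; s > 0; s >>= 1) {
  if (offset < s)
    for (int r = 0; r < 6; r++)
      red_acc[r][tid] += red_acc[r][tid + s];
}

and the macro issues no barrier between steps, which relies on the t_per_atom threads for an atom falling inside a single warp/wavefront.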
diff --git a/lib/gpu/lal_gauss.cpp b/lib/gpu/lal_gauss.cpp new file mode 100644 index 0000000000..d05573f751 --- /dev/null +++ b/lib/gpu/lal_gauss.cpp @@ -0,0 +1,146 @@ +/*************************************************************************** + gauss.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the gauss pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "gauss_cl.h" +#elif defined(USE_CUDART) +const char *gauss=0; +#else +#include "gauss_cubin.h" +#endif + +#include "lal_gauss.h" +#include +using namespace LAMMPS_AL; +#define GaussT Gauss + +extern Device device; + +template +GaussT::Gauss() : BaseAtomic(), _allocated(false) { +} + +template +GaussT::~Gauss() { + clear(); +} + +template +int GaussT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int GaussT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_b, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,gauss,"k_gauss"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b, + host_cutsq,host_offset); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=gauss1.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void GaussT::clear() { + if (!_allocated) + return; + _allocated=false; + + gauss1.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double GaussT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Gauss); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void GaussT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + 
this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class Gauss; diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu new file mode 100644 index 0000000000..b078d17c3b --- /dev/null +++ b/lib/gpu/lal_gauss.cu @@ -0,0 +1,189 @@ +// ************************************************************************** +// gauss.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the gauss pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +#else +texture pos_tex; +#endif +#else +#define pos_tex x_ +#endif + +__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1, + const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + gauss1[mtype].w); + energy+=factor_lj*e; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) { + numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - + gauss1[mtype].w); + energy+=factor_lj*e; + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_gauss.h 
b/lib/gpu/lal_gauss.h new file mode 100644 index 0000000000..fad790899d --- /dev/null +++ b/lib/gpu/lal_gauss.h @@ -0,0 +1,77 @@ +/*************************************************************************** + gauss.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the gauss pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_GAUSS_H +#define LAL_GAUSS_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class Gauss : public BaseAtomic<numtyp, acctyp> { + public: + Gauss(); + ~Gauss(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_a, double **host_b, double **host_offset, + double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset + UCL_D_Vec<numtyp4> gauss1; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp new file mode 100644 index 0000000000..4d95c7cfb9 --- /dev/null +++ b/lib/gpu/lal_gauss_ext.cpp @@ -0,0 +1,120 @@ +/*************************************************************************** + gauss_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to gauss acceleration routines. 
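The packed gauss1 vector (gauss1.x = a, gauss1.y = b, gauss1.z = cutsq, gauss1.w = offset) fully determines the pair model E(r) = -a*exp(-b*r^2), shifted so it vanishes at the cutoff. A minimal host-side sketch, assuming gauss1.w holds the value that zeroes the kernel's energy expression at the cutoff (helper names illustrative):

#include <cmath>

struct GaussPair { double energy, fpair; };   // fpair = |F|/r

GaussPair gauss_pair(double a, double b, double rsq, double cutsq) {
  GaussPair p{0.0, 0.0};
  if (rsq < cutsq) {
    const double offset = a*std::exp(-b*cutsq);  // makes energy(cutsq) == 0
    p.energy = -(a*std::exp(-b*rsq) - offset);   // the kernel's e
    p.fpair  = -2.0*a*b*std::exp(-b*rsq);        // -dE/dr divided by r
  }
  return p;
}

For a > 0 this is a purely attractive Gaussian well; the negative fpair pulls the pair together, following the usual LAMMPS fpair convention.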
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_gauss.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Gauss GLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_b, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + GLMF.clear(); + gpu_mode=GLMF.device->gpu_mode(); + double gpu_split=GLMF.device->particle_split(); + int first_gpu=GLMF.device->first_device(); + int last_gpu=GLMF.device->last_device(); + int world_me=GLMF.device->world_me(); + int gpu_rank=GLMF.device->gpu_rank(); + int procs_per_gpu=GLMF.device->procs_per_gpu(); + + GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu); + + bool message=false; + if (GLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, + offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + + GLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + GLMF.estimate_gpu_overhead(); + return init_ok; +} + +void gauss_gpu_clear() { + GLMF.clear(); +} + +int ** gauss_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void gauss_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double gauss_gpu_bytes() { + return GLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp new file mode 100644 index 0000000000..54f5fe7ce0 --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_coul_debye.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the lj/cut/coul/debye pair style. 
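The only new physics in lj/cut/coul/debye relative to lj/cut/coul/cut is the Debye screening factor exp(-kappa*r) on the Coulomb term, with kappa the inverse screening length that init() stores as _kappa below. A minimal sketch of the screened pair term (helper name illustrative; the kernel folds the same expression, times factor_coul, into fpair alongside the LJ part):

#include <cmath>

struct DebyePair { double energy, fpair; };   // fpair = |F|/r

DebyePair coul_debye(double qqrd2e, double qi, double qj,
                     double r, double kappa) {
  const double screening = std::exp(-kappa*r);
  DebyePair p;
  p.energy = qqrd2e*qi*qj*screening/r;
  // -dE/dr = qqrd2e*qi*qj*screening*(kappa + 1/r)/r; dividing by r:
  p.fpair  = qqrd2e*qi*qj*screening*(kappa + 1.0/r)/(r*r);
  return p;
}

Setting kappa = 0 recovers the bare Coulomb term, so coul/cut results make a convenient sanity check for the GPU path.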
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "lj_coul_debye_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_debye=0; +#else +#include "lj_coul_debye_cubin.h" +#endif + +#include "lal_lj_coul_debye.h" +#include +using namespace LAMMPS_AL; +#define LJCoulDebyeT LJCoulDebye + +extern Device device; + +template +LJCoulDebyeT::LJCoulDebye() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulDebyeT::~LJCoulDebye() { + clear(); +} + +template +int LJCoulDebyeT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulDebyeT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_debye,"k_lj_debye"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + _kappa=kappa; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulDebyeT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulDebyeT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulDebye); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if 
(_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &_qqrd2e, &_kappa, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &cutsq, + &_qqrd2e, &_kappa, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJCoulDebye; diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu new file mode 100644 index 0000000000..e63d7a40e3 --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -0,0 +1,256 @@ +// ************************************************************************** +// lj_coul_debye.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the lj/cut/coul/debye pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const numtyp kappa, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (rsq < lj1[mtype].w) { + e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, 
+ __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + __global numtyp *_cutsq, const numtyp qqrd2e, + const numtyp kappa, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + if (rsq < lj1[mtype].w) { + e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h new file mode 100644 index 0000000000..73e38c647d --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye.h @@ -0,0 +1,85 @@ +/*************************************************************************** + lj_coul_debye.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the lj/cut/coul/debye pair style. 
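For reference, the screened-Coulomb terms the two kernels above accumulate: with screening = exp(-kappa*r), the Debye pair energy is qqrd2e*qi*qj*exp(-kappa*r)/r and the force follows by differentiation. A sketch under those assumptions (debye_coul and qiqj are illustrative names):

#include <cmath>

// Debye-screened Coulomb: E = C*qi*qj*exp(-kappa*r)/r with C = qqrd2e.
// Returns the force contribution divided by r (the fpair convention).
inline double debye_coul(const double qqrd2e, const double qiqj,
                         const double kappa, const double rsq,
                         const double factor_coul, double &e_coul) {
  const double r = std::sqrt(rsq);
  const double rinv = 1.0/r;
  const double screening = std::exp(-kappa*r);
  e_coul += factor_coul*qqrd2e*qiqj*rinv*screening;   // matches the kernel line
  // -dE/dr = C*qi*qj*screening*(kappa + 1/r); divide by r for fpair:
  return factor_coul*qqrd2e*qiqj*screening*(kappa + rinv)*rinv*rinv;
}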
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_COUL_DEBYE_H +#define LAL_LJ_COUL_DEBYE_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJCoulDebye : public BaseCharge<numtyp, acctyp> { + public: + LJCoulDebye(); + ~LJCoulDebye(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double kappa); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// cutsq + UCL_D_Vec<numtyp> cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e,_kappa; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp new file mode 100644 index 0000000000..3e5aab4d9d --- /dev/null +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + lj_coul_debye_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to lj/cut/coul/debye acceleration routines.
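The lj1/lj3 members documented above are filled by type_pack4, which flattens square per-type coefficient tables into one vec4-style array indexed by itype*lj_types + jtype. A host-side sketch of that packing (0-based here for simplicity; the library carries LAMMPS' 1-based types through, and pack4/numtyp4 are illustrative names):

#include <vector>

struct numtyp4 { double x, y, z, w; };

// Flatten four ntypes x ntypes host tables into a single padded table of
// lj_types*lj_types vec4 entries; lj_types >= ntypes, extra entries stay zero
// (padding used by the shared-memory fast kernels).
std::vector<numtyp4> pack4(const int ntypes, const int lj_types,
                           double **a, double **b, double **c, double **d) {
  std::vector<numtyp4> table(lj_types*lj_types, numtyp4{0.0, 0.0, 0.0, 0.0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      table[i*lj_types + j] = numtyp4{a[i][j], b[i][j], c[i][j], d[i][j]};
  return table;
}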
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_debye.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulDebye LJCDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa) { + LJCDMF.clear(); + gpu_mode=LJCDMF.device->gpu_mode(); + double gpu_split=LJCDMF.device->particle_split(); + int first_gpu=LJCDMF.device->first_device(); + int last_gpu=LJCDMF.device->last_device(); + int world_me=LJCDMF.device->world_me(); + int gpu_rank=LJCDMF.device->gpu_rank(); + int procs_per_gpu=LJCDMF.device->procs_per_gpu(); + + LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu); + + bool message=false; + if (LJCDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, kappa); + + LJCDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCDMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljcd_gpu_clear() { + LJCDMF.clear(); +} + +int** ljcd_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljcd_gpu_bytes() { + return LJCDMF.host_memory_usage(); +} + + diff --git 
a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp new file mode 100644 index 0000000000..fe13808d28 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.cpp @@ -0,0 +1,168 @@ +/*************************************************************************** + lj_dsf.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/dsf pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_dsf_cl.h" +#elif defined(USE_CUDART) +const char *lj_dsf=0; +#else +#include "lj_dsf_cubin.h" +#endif + +#include "lal_lj_dsf.h" +#include +using namespace LAMMPS_AL; +#define LJDSFT LJDSF + +extern Device device; + +template +LJDSFT::LJDSF() : BaseCharge(), + _allocated(false) { +} + +template +LJDSFT::~LJDSF() { + clear(); +} + +template +int LJDSFT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double e_shift, const double f_shift, + const double alpha) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_dsf,"k_lj_dsf"); + if (success!=0) + return success; + + _cut_coulsq=host_cut_coulsq; + _e_shift=e_shift; + _f_shift=f_shift; + _alpha=alpha; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cutsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJDSFT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJDSFT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJDSF); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// 
--------------------------------------------------------------------------- +template +void LJDSFT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_e_shift, &_f_shift, &_alpha, + &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJDSF; diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu new file mode 100644 index 0000000000..9073d72884 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.cu @@ -0,0 +1,261 @@ +// ************************************************************************** +// lj_dsf.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the lj/cut/coul/dsf pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : 7/12/2012 +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#define MY_PIS (acctyp)1.77245385090551602729 + +__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1, + __global numtyp4* lj3, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + 
virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + +__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, + __global numtyp4* lj3_in, __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp e_shift, const numtyp f_shift, + const numtyp alpha, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + __syncthreads(); + + if (ii0) { + if (rsq < cut_coulsq) { + numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift); + e_coul += e; + } + if (rsq < lj1[mtype].z) { + numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); + energy+=factor_lj*(e-lj3[mtype].z); + } + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h new file mode 100644 index 0000000000..5badf543c4 --- /dev/null +++ b/lib/gpu/lal_lj_dsf.h @@ -0,0 +1,85 @@ +/*************************************************************************** + lj_dsf.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/cut/coul/dsf pair style. 
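The damped-shifted-force (DSF) Coulomb terms evaluated by the kernels above, written with std::erfc for clarity (the device code uses a polynomial erfc fit). Here e_shift and f_shift are the usual DSF constants fixed at the cutoff Rc, f_shift = -(erfc(a*Rc)/Rc^2 + 2a/sqrt(pi)*exp(-a^2*Rc^2)/Rc) and e_shift = erfc(a*Rc)/Rc - f_shift*Rc; dsf_coul is an illustrative name:

#include <cmath>

// DSF Coulomb at distance r (r < cutoff); prefactor = qqrd2e*qi*qj/r.
inline void dsf_coul(const double prefactor, const double r,
                     const double alpha, const double e_shift,
                     const double f_shift, double &e_coul, double &fpair) {
  const double MY_PIS = 1.77245385090551602729;   // sqrt(pi), as in the kernel
  const double erfcc = std::erfc(alpha*r);
  const double erfcd = std::exp(-alpha*alpha*r*r);
  // energy, matching e = prefactor*(erfcc - r*e_shift - rsq*f_shift) above
  e_coul = prefactor*(erfcc - r*e_shift - r*r*f_shift);
  // force factor f/r
  fpair = prefactor*(erfcc/r + 2.0*alpha/MY_PIS*erfcd + r*f_shift)/r;
}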
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_DSF_H +#define LAL_LJ_DSF_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class LJDSF : public BaseCharge<numtyp, acctyp> { + public: + LJDSF(); + ~LJDSF(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double e_shift, const double f_shift, + const double alpha); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq + UCL_D_Vec<numtyp4> lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec<numtyp4> lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp new file mode 100644 index 0000000000..52b56f38c2 --- /dev/null +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -0,0 +1,132 @@ +/*************************************************************************** + lj_dsf_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines.
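Both charge styles above select between the k_*_fast and generic kernels the same way; the decision in init() reduces to the check sketched below (the MAX_SHARED_TYPES-sized __local tables must fit, and the thread block must be large enough to stage them cooperatively). choose_shared_types is an illustrative name:

// Decide whether per-type coefficient tables can live in shared/local memory.
// If so, the type dimension is padded to max_shared_types and the *_fast
// kernels cooperatively load lj1/lj3/cutsq into __local arrays.
bool choose_shared_types(const int ntypes, const int max_shared_types,
                         const int block_size, int &lj_types) {
  lj_types = ntypes;
  if (ntypes <= max_shared_types && block_size >= max_shared_types) {
    lj_types = max_shared_types;  // pad table to the shared-memory tile size
    return true;                  // dispatch k_*_fast
  }
  return false;                   // generic kernel reads types from global memory
}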
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : 7/12/2012 + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_dsf.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJDSF LJDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double e_shift, const double f_shift, + const double alpha) { + LJDMF.clear(); + gpu_mode=LJDMF.device->gpu_mode(); + double gpu_split=LJDMF.device->particle_split(); + int first_gpu=LJDMF.device->first_device(); + int last_gpu=LJDMF.device->last_device(); + int world_me=LJDMF.device->world_me(); + int gpu_rank=LJDMF.device->gpu_rank(); + int procs_per_gpu=LJDMF.device->procs_per_gpu(); + + LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu); + + bool message=false; + if (LJDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, e_shift, + f_shift, alpha); + + LJDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJDMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljd_gpu_clear() { + LJDMF.clear(); +} + +int** ljd_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljd_gpu_bytes() { + return 
LJDMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp new file mode 100644 index 0000000000..ee0571fd3e --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -0,0 +1,315 @@ +/*************************************************************************** + yukawa_colloid.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the yukawa/colloid pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifdef USE_OPENCL +#include "yukawa_colloid_cl.h" +#elif defined(USE_CUDART) +const char *yukawa_colloid=0; +#else +#include "yukawa_colloid_cubin.h" +#endif + +#include "lal_yukawa_colloid.h" +#include +using namespace LAMMPS_AL; +#define YukawaColloidT YukawaColloid + +extern Device device; + +template +YukawaColloidT::YukawaColloid() : BaseAtomic(), +_allocated(false), _max_rad_size(0) { +} + +template +YukawaColloidT::~YukawaColloid() { + clear(); +} + +template +int YukawaColloidT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int YukawaColloidT::init(const int ntypes, + double **host_cutsq, double **host_a, + double **host_offset, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, const double kappa) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,yukawa_colloid,"k_yukawa_colloid"); + if (success!=0) + return success; + + // allocate rad + + bool cpuview=false; + if (this->ucl_device->device_type()==UCL_CPU) + cpuview=true; + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_rad_size=static_cast(static_cast(ef_nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + if (cpuview) + dev_rad.view(host_rad); + else + dev_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY); + + rad_tex.get_texture(*(this->pair_program),"rad_tex"); + rad_tex.bind_float(dev_rad,1); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + _kappa = kappa; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_OPTIMIZED); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a, + host_offset,host_cutsq); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void YukawaColloidT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + sp_lj.clear(); + + host_rad.clear(); + dev_rad.clear(); + + this->clear_atomic(); +} + +template +double YukawaColloidT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(YukawaColloid); +} + 
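The two compute() overloads below regrow the per-atom radius buffer with roughly 10% headroom (the 1.10 factor), so a slowly growing atom count does not force a reallocation and texture rebind on every step. The sizing rule in isolation (rad_capacity is an illustrative name):

// Grow-once-with-slack sizing used for _max_rad_size; returns the new
// capacity, unchanged if the current allocation already fits.
static int rad_capacity(const int nall, const int current_max) {
  if (nall <= current_max) return current_max;               // no reallocation
  return static_cast<int>(static_cast<double>(nall)*1.10);   // 10% headroom
}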
+// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then compute atom energies/forces +// --------------------------------------------------------------------------- +template +void YukawaColloidT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *rad) { + this->acc_timers(); + + // ------------------- Resize rad array -------------------------- + + if (nall>_max_rad_size) { + dev_rad.clear(); + host_rad.clear(); + + _max_rad_size=static_cast(static_cast(nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } + } else { + dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + } + + rad_tex.bind_float(dev_rad,1); + } + + // ---------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return; + } + + int ago=this->hd_balancer.ago_first(f_ago); + int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); + this->ans->inum(inum); + host_start=inum; + + // ----------------------------------------------------------------- + + if (ago==0) { + this->reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + this->atom->cast_x_data(host_x,host_type); + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + this->add_rad_data(); + + this->loop(eflag,vflag); + this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU and then compute per-atom densities +// --------------------------------------------------------------------------- +template +int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double *rad) { + this->acc_timers(); + + // ------------------- Resize rad array ---------------------------- + + if (nall>_max_rad_size) { + dev_rad.clear(); + host_rad.clear(); + + _max_rad_size=static_cast(static_cast(nall)*1.10); + host_rad.alloc(_max_rad_size,*(this->ucl_device)); + + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } + } else { + dev_rad.alloc(_max_rad_size,*(this->ucl_device)); + } + + rad_tex.bind_float(dev_rad,1); + } + + // ----------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return NULL; + } + + // load balance, returning the atom count on the device (inum) + 
this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return NULL; + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->cast_rad_data(rad); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->add_rad_data(); + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + this->loop(eflag,vflag); + this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); + + return this->nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate per-atom energies and forces +// --------------------------------------------------------------------------- +template +void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &dev_rad, &coeff, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &dev_rad, &coeff, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); + } + this->time_pair.stop(); +} + +template class YukawaColloid; diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu new file mode 100644 index 0000000000..4eb9bc87b2 --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -0,0 +1,202 @@ +// ************************************************************************** +// yukawa_colloid.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the yukawa/colloid pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture rad_tex; +#else +texture pos_tex; +texture rad_tex; +#endif + +#else +#define pos_tex x_ +#define rad_tex rad_ +#endif + +__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_, + __global numtyp4 *coeff, const int lj_types, + __global numtyp *sp_lj_in, __global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, + const int nbor_pitch, 
const int t_per_atom, + const numtyp kappa) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[4]; + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + + if (ii0) { + numtyp e=coeff[mtype].x/kappa * screening; + energy+=factor_lj*(e-coeff[mtype].y); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + +__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_, + __global numtyp4 *coeff_in, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, + __global acctyp4 *ans, __global acctyp *engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, + const numtyp kappa) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + if (tid<4) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) { + numtyp e=coeff[mtype].x/kappa * screening; + energy+=factor_lj*(e-coeff[mtype].y); + } + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + } // if ii +} + diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h new file mode 100644 index 0000000000..cd3cbb5cab --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid.h @@ -0,0 +1,127 @@ +/*************************************************************************** + yukawa_colloid.h + ------------------- + Trung Dac Nguyen (ORNL) + + Class for acceleration of the yukawa/colloid pair style. 
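The per-pair math the two kernels above implement, with the colloid variant measuring the screening from the particle surfaces (r shifted by radi+radj): E(r) = (a/kappa)*exp(-kappa*(r-(radi+radj))) - offset. A sketch; yukawa_colloid_eval is an illustrative name:

#include <cmath>

// Returns the force factor f/r; accumulates the offset-shifted pair energy.
inline double yukawa_colloid_eval(const double a, const double offset,
                                  const double kappa, const double r,
                                  const double radi, const double radj,
                                  const double factor_lj, double &energy) {
  const double screening = std::exp(-kappa*(r - (radi + radj)));
  energy += factor_lj*(a/kappa*screening - offset);  // matches the kernel line
  return factor_lj*a*screening/r;                    // -(dE/dr)/r
}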
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_YUKAWA_COLLOID_H +#define LAL_YUKAWA_COLLOID_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template <class numtyp, class acctyp> +class YukawaColloid : public BaseAtomic<numtyp, acctyp> { + public: + YukawaColloid(); + ~YukawaColloid(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_a, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, const double kappa); + + inline void cast_rad_data(double* rad) { + int nall = this->atom->nall(); + if (this->ucl_device->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_rad.view((numtyp*)rad,nall,*(this->ucl_device)); + dev_rad.view(host_rad); + } else { + for (int i=0; i<nall; i++) + host_rad[i]=rad[i]; + } + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_rad.begin(),rad,nall*sizeof(numtyp)); + else + for (int i=0; i<nall; i++) + host_rad[i]=rad[i]; + } + } + + /// Copy rad data to device asynchronously + inline void add_rad_data() { + ucl_copy(dev_rad,host_rad,this->atom->nall(),true); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *rad); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *rad); + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture rad_tex; + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = a, coeff.y = offset, coeff.z = cutsq + UCL_D_Vec<numtyp4> coeff; + /// Special LJ values + UCL_D_Vec<numtyp> sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + int _max_rad_size; + + numtyp _kappa; + + /// Per-atom arrays + UCL_H_Vec<numtyp> host_rad; + UCL_D_Vec<numtyp> dev_rad; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp new file mode 100644 index 0000000000..c43166c33b --- /dev/null +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -0,0 +1,123 @@
+/*************************************************************************** + yukawa_colloid_ext.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Functions for LAMMPS access to colloid acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_yukawa_colloid.h" + +using namespace std; +using namespace LAMMPS_AL; + +static YukawaColloid YKCOLLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, + double **host_offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + const double kappa) { + YKCOLLMF.clear(); + gpu_mode=YKCOLLMF.device->gpu_mode(); + double gpu_split=YKCOLLMF.device->particle_split(); + int first_gpu=YKCOLLMF.device->first_device(); + int last_gpu=YKCOLLMF.device->last_device(); + int world_me=YKCOLLMF.device->world_me(); + int gpu_rank=YKCOLLMF.device->gpu_rank(); + int procs_per_gpu=YKCOLLMF.device->procs_per_gpu(); + + YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu); + + bool message=false; + if (YKCOLLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, + inum, nall, 300, maxspecial, cell_size, gpu_split, + screen, kappa); + + YKCOLLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + YKCOLLMF.estimate_gpu_overhead(); + return init_ok; +} + +void ykcolloid_gpu_clear() { + YKCOLLMF.clear(); +} + +int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_rad) { + return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_rad); +} + +void ykcolloid_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_rad) { + YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time, + success,host_rad); +} + +double ykcolloid_gpu_bytes() { + return YKCOLLMF.host_memory_usage(); +} + +
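A note on the int** returned by the *_gpu_compute_n wrappers: the underlying compute() returns host_jlist.begin()-host_start, an offset base pointer, so the caller can index it directly with the local atom ids i >= host_start that remained on the CPU. The idiom in isolation, with illustrative values (pointer arithmetic before an array's start is technically undefined in C++, but this is the convention the library relies on):

#include <cstdio>

int main() {
  int rows[4] = {100, 101, 102, 103};  // neighbor rows stored for atoms 2..5
  const int host_start = 2;            // atoms 0..1 handled on the device
  int *jlist = rows - host_start;      // offset base, as returned above
  std::printf("%d\n", jlist[2]);       // 100: row for atom id 2
  std::printf("%d\n", jlist[5]);       // 103: row for atom id 5
  return 0;
}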