// Page metadata: forked from lijiext/lammps -- ucl_d_mat.h, C++, 479 lines, 18 KiB
/***************************************************************************
                                 ucl_d_mat.h
                             -------------------
                               W. Michael Brown

  Matrix Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// 2D Matrix on device (can have extra column storage to get correct alignment)
/** Elements are stored row major. Consecutive rows are row_size() elements
  * (row_bytes() bytes) apart, which can be more than cols() when rows are
  * padded. An object either owns its allocation (after alloc()/resize()) or
  * is a view (kind UCL_VIEW) of memory allocated elsewhere. **/
template <class numtyp>
class UCL_D_Mat : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 1,
    ROW_MAJOR = 1,
    VECTOR = 0
  };
  typedef numtyp data_type;

  /// Construct an empty matrix (all dimensions zero, nothing allocated)
  UCL_D_Mat() : _pitch(0), _row_size(0), _rows(0), _cols(0) {}
  ~UCL_D_Mat() { _device_free(*this); }

  /// Construct with specified rows and cols
  /** \sa alloc() **/
  UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _pitch(0), _row_size(0), _rows(0), _cols(0)
    { alloc(rows,cols,device,kind); }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param cq Default command queue for operations copied from another mat
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();

    int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _kind=kind;
    _set_owned_shape(rows,cols);
    return err;
  }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();

    int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _kind=kind;
    _set_owned_shape(rows,cols);
    return err;
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) {
    clear();
    _set_view_shape(rows,cols,stride);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin());
    #endif

    _set_view_end();
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is taken from input.row_size() **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols)
    { view(input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows(),input.cols()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev) {
    clear();
    _set_view_shape(rows,cols,stride);
    this->_cq=dev.cq();
    _array=input;
    _set_view_end();
    #ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is set to cols (rows are assumed to be unpadded) **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) { view(input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param offset Number of elements to skip from the start of input
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride) {
    clear();
    _set_view_shape(rows,cols,stride);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    // The memory object is shared; the shift is carried in _offset instead
    _array=input.begin();
    _offset=offset+input.offset();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
    #endif

    _set_view_end();
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is taken from input.row_size() **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols)
    { view_offset(offset,input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view
    * - For a single-row input the view has cols()-offset columns; otherwise
    *   the row count is reduced by offset/row_size() **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input) {
    if (input.rows()==1)
      view_offset(offset,input,1,input.cols()-offset);
    else
      view_offset(offset,input,input.rows()-offset/input.row_size(),
                  input.cols());
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param offset Number of elements to skip from the start of input
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols,const size_t stride,
                          UCL_Device &dev) {
    clear();
    _set_view_shape(rows,cols,stride);
    this->_cq=dev.cq();

    #ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #else
    #ifdef _UCL_DEVICE_PTR_MAT
    // Opaque device pointer: the element offset is converted to bytes
    _array=input+offset*sizeof(numtyp);
    #else
    _array=input+offset;
    #endif
    #endif

    _set_view_end();
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is set to cols (rows are assumed to be unpadded) **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }

  /// Free memory and set size to 0
  /** The object is left as an empty view (kind UCL_VIEW, zero columns). **/
  inline void clear()
    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }

  /// Resize the allocation to contain rows x cols elements
  /** \note Cannot be used on views
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int resize(const int rows, const int cols) {
    assert(_kind!=UCL_VIEW);

    int err=_device_resize(*this,rows,cols,_pitch);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      // Widen before multiplying so the reported size cannot overflow int
      std::cerr << "UCL Error: Could not allocate "
                << static_cast<size_t>(rows)*static_cast<size_t>(cols)*
                   sizeof(numtyp)
                << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _set_owned_shape(rows,cols);
    return err;
  }

  /// Resize (only if bigger) the allocation to contain rows x cols elements
  /** \note Cannot be used on views
    * \return UCL_SUCCESS if no resize was needed or the resize succeeded **/
  inline int resize_ib(const int rows, const int cols) {
    if (static_cast<size_t>(cols)>_cols || static_cast<size_t>(rows)>_rows)
      return resize(rows,cols);
    return UCL_SUCCESS;
  }

  /// Set each element to zero asynchronously in the default command_queue
  inline void zero() { zero(_cq); }
  /// Set first n elements to zero asynchronously in the default command_queue
  inline void zero(const int n) { zero(n,_cq); }
  /// Set each element to zero asynchronously
  /** Clears row_bytes()*rows() bytes, i.e. row padding is zeroed as well. **/
  inline void zero(command_queue &cq)
    { _device_zero(*this,row_bytes()*_rows,cq); }
  /// Set first n elements to zero asynchronously
  inline void zero(const int n, command_queue &cq)
    { _device_zero(*this,n*sizeof(numtyp),cq); }


  #ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline device_ptr & begin() { return _array; }
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline const device_ptr & begin() const { return _array; }
  #else
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * & begin() { return _array; }
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * const & begin() const { return _array; }
  /// For CUDA-RT, get device pointer to one past last element
  /** For an owned allocation this is begin()+rows()*row_size(); for a view
    * it is one past the last column of the last row. **/
  inline numtyp * end() { return _end; }
  /// For CUDA-RT, get device pointer to one past last element
  /** For an owned allocation this is begin()+rows()*row_size(); for a view
    * it is one past the last column of the last row. **/
  inline const numtyp * end() const { return _end; }
  #endif

  #ifdef _UCL_DEVICE_PTR_MAT
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline device_ptr & cbegin() { return _array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline const device_ptr & cbegin() const { return _array; }
  #else
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline numtyp ** cbegin() { return &_array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp* const* (address of the stored pointer;
    *   the pointer itself cannot be modified through a const matrix) **/
  inline numtyp * const * cbegin() const { return &_array; }
  #endif

  /// Get the number of elements
  inline size_t numel() const { return _cols*_rows; }
  /// Get the number of rows
  inline size_t rows() const { return _rows; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _row_size; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _pitch; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  #ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
  #else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  #endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  /// Record the geometry of an owned allocation after _pitch has been set
  /** Derives the row stride from _pitch and, where a typed pointer is kept,
    * points _end one past the padded allocation (rows*row_size elements). **/
  inline void _set_owned_shape(const size_t rows, const size_t cols) {
    _rows=rows;
    _cols=cols;
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_row_size*rows;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
  }

  /// Record the geometry of a view; stride is in elements
  inline void _set_view_shape(const size_t rows, const size_t cols,
                              const size_t stride) {
    _kind=UCL_VIEW;
    _rows=rows;
    _cols=cols;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
  }

  /// Point _end one past the last element of the last row of a view
  /** Must be called after _array and the view geometry are set. Padding after
    * the last row is not counted because a view does not own it. No-op when
    * the device pointer is an opaque handle. **/
  inline void _set_view_end() {
    #ifndef _UCL_DEVICE_PTR_MAT
    if (_rows>0)
      _end=_array+(_rows-1)*_row_size+_cols;
    else
      _end=_array;
    #endif
  }

  // Row stride in bytes, row stride in elements, and logical dimensions
  size_t _pitch, _row_size, _rows, _cols;

  #ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
  #else
  numtyp *_array,*_end;
  #endif

  #ifdef _OCL_MAT
  // Offset (in elements) of the first element within the memory object
  size_t _offset;
  #endif
};

#endif
// end of the _UCL_MAT_ALLOW guarded section