forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
17091c755e
commit
d30ba2d7eb
|
@ -1,2 +1 @@
|
|||
Geryon Version 11.094
|
||||
|
||||
Geryon Version 12.034
|
||||
|
|
|
@ -117,6 +117,14 @@ class UCL_Device {
|
|||
_cq.pop_back();
|
||||
}
|
||||
|
||||
/// Set the default command queue (by default this is the null stream)
|
||||
/** \param i index of the command queue (as added by push_command_queue())
|
||||
If i is 0, the default command queue is set to the null stream **/
|
||||
inline void set_command_queue(const int i) {
|
||||
if (i==0) _cq[0]=0;
|
||||
else _cq[0]=_cq[i];
|
||||
}
|
||||
|
||||
/// Get the current CUDA device name
|
||||
inline std::string name() { return name(_device); }
|
||||
/// Get the CUDA device name
|
||||
|
@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) {
|
|||
if (_device>-1) {
|
||||
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
|
||||
for (int i=1; i<num_queues(); i++) pop_command_queue();
|
||||
_cq[0]=0;
|
||||
}
|
||||
_device=_properties[num].device_id;
|
||||
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
|
||||
|
|
|
@ -34,11 +34,11 @@ class UCL_Texture;
|
|||
/// Class storing 1 or more kernel functions from a single string or file
|
||||
class UCL_Program {
|
||||
public:
|
||||
inline UCL_Program(UCL_Device &device) {}
|
||||
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
|
||||
inline ~UCL_Program() {}
|
||||
|
||||
/// Initialize the program with a device
|
||||
inline void init(UCL_Device &device) { }
|
||||
inline void init(UCL_Device &device) { _cq=device.cq(); }
|
||||
|
||||
/// Clear any data associated with program
|
||||
/** \note Must call init() after each clear **/
|
||||
|
@ -130,6 +130,7 @@ class UCL_Program {
|
|||
friend class UCL_Kernel;
|
||||
private:
|
||||
CUmodule _module;
|
||||
CUstream _cq;
|
||||
friend class UCL_Texture;
|
||||
};
|
||||
|
||||
|
@ -141,7 +142,7 @@ class UCL_Kernel {
|
|||
|
||||
UCL_Kernel(UCL_Program &program, const char *function) :
|
||||
_dimensions(1), _num_args(0), _param_size(0)
|
||||
{ _num_blocks[0]=0; set_function(program,function); }
|
||||
{ _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
|
||||
|
||||
~UCL_Kernel() {}
|
||||
|
||||
|
@ -160,6 +161,7 @@ class UCL_Kernel {
|
|||
#endif
|
||||
return UCL_FUNCTION_NOT_FOUND;
|
||||
}
|
||||
_cq=program._cq;
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -229,7 +231,7 @@ class UCL_Kernel {
|
|||
/// Run the kernel in the default command queue
|
||||
inline void run() {
|
||||
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
|
||||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
|
||||
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
|
||||
}
|
||||
|
||||
/// Run the kernel in the specified command queue
|
||||
|
@ -245,6 +247,7 @@ class UCL_Kernel {
|
|||
|
||||
private:
|
||||
CUfunction _kernel;
|
||||
CUstream _cq;
|
||||
unsigned _dimensions;
|
||||
unsigned _num_blocks[2];
|
||||
unsigned _num_args;
|
||||
|
|
|
@ -57,6 +57,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -72,6 +73,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
|
||||
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=dev.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -92,6 +94,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
|
|||
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -101,6 +104,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
|||
CUresult err=cuMemAlloc(&mat.cbegin(),n);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=dev.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -115,6 +119,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
|
|||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=cm.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
|
|||
pitch=static_cast<size_t>(upitch);
|
||||
if (err!=CUDA_SUCCESS)
|
||||
return UCL_MEMORY_ERROR;
|
||||
mat.cq()=d.cq();
|
||||
return UCL_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstArray=dst.cbegin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcArray=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> {
|
|||
const size_t rows) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> {
|
|||
const size_t rows, CUstream &cq) {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstHost=dst.begin();
|
||||
ins.srcHost=src.begin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
@ -529,7 +535,7 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||
CUstream &cq) {
|
||||
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
|
||||
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
|
||||
}
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
|
||||
|
@ -546,8 +552,8 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
} else {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2D(&ins));
|
||||
|
@ -560,16 +566,16 @@ template <int mem1, int mem2> struct _ucl_memcpy {
|
|||
if (p1::PADDED==0 || p2::PADDED==0) {
|
||||
size_t src_offset=0, dst_offset=0;
|
||||
for (size_t i=0; i<rows; i++) {
|
||||
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
|
||||
src.cbegin()+src_offset,cols));
|
||||
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
|
||||
src.cbegin()+src_offset,cols,cq));
|
||||
src_offset+=spitch;
|
||||
dst_offset+=dpitch;
|
||||
}
|
||||
} else {
|
||||
CUDA_MEMCPY2D ins;
|
||||
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
|
||||
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
|
||||
ins.dstDevice=dst.cbegin();
|
||||
ins.srcDevice=src.cbegin();
|
||||
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
|
||||
|
|
|
@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
|
|||
if (my_gpu>=gpu->num_devices())
|
||||
return -2;
|
||||
|
||||
#ifndef CUDA_PRX
|
||||
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
|
||||
return -7;
|
||||
#endif
|
||||
|
||||
if (gpu->set(my_gpu)!=UCL_SUCCESS)
|
||||
return -6;
|
||||
|
||||
gpu->push_command_queue();
|
||||
gpu->set_command_queue(1);
|
||||
|
||||
_long_range_precompute=0;
|
||||
|
||||
int flag=compile_kernels();
|
||||
|
||||
int flag=0;
|
||||
for (int i=0; i<_procs_per_gpu; i++) {
|
||||
if (_gpu_rank==i)
|
||||
flag=compile_kernels();
|
||||
gpu_barrier();
|
||||
}
|
||||
return flag;
|
||||
}
|
||||
|
||||
|
|
|
@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
|
|||
// copy fp from device to host for comm
|
||||
_nlocal=nlocal;
|
||||
time_fp1.start();
|
||||
ucl_copy(host_fp,dev_fp,nlocal,false);
|
||||
ucl_copy(host_fp,dev_fp,nlocal,true);
|
||||
time_fp1.stop();
|
||||
time_fp1.sync_stop();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
|
|||
// copy fp from device to host for comm
|
||||
_nlocal=inum_full;
|
||||
time_fp1.start();
|
||||
ucl_copy(host_fp,dev_fp,inum_full,false);
|
||||
ucl_copy(host_fp,dev_fp,inum_full,true);
|
||||
time_fp1.stop();
|
||||
time_fp1.sync_stop();
|
||||
|
||||
return this->nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
|
|
@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
|
|||
time_kernel.init(*dev);
|
||||
time_hybrid1.init(*dev);
|
||||
time_hybrid2.init(*dev);
|
||||
time_transpose.init(*dev);
|
||||
time_nbor.zero();
|
||||
time_kernel.zero();
|
||||
time_hybrid1.zero();
|
||||
time_hybrid2.zero();
|
||||
time_transpose.zero();
|
||||
|
||||
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||
if (_max_atoms==0)
|
||||
|
@ -203,6 +205,7 @@ void Neighbor::clear() {
|
|||
time_nbor.clear();
|
||||
time_hybrid1.clear();
|
||||
time_hybrid2.clear();
|
||||
time_transpose.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
|||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
|
||||
if (maxn>_max_nbors) {
|
||||
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
|
||||
dev_nbor.clear();
|
||||
success=success &&
|
||||
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
|
||||
_gpu_bytes=dev_nbor.row_bytes();
|
||||
if (_max_host>0) {
|
||||
host_nbor.clear();
|
||||
dev_host_nbor.clear();
|
||||
success=success && (host_nbor.alloc(mn*_max_host,*dev,
|
||||
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
||||
success=success && (dev_host_nbor.alloc(mn*_max_host,
|
||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
int *ptr=host_nbor.begin();
|
||||
for (int i=0; i<_max_host; i++) {
|
||||
host_jlist[i]=ptr;
|
||||
ptr+=mn;
|
||||
}
|
||||
_gpu_bytes+=dev_host_nbor.row_bytes();
|
||||
} else {
|
||||
dev_host_nbor.view(dev_nbor);
|
||||
dev_host_numj.view(dev_nbor);
|
||||
}
|
||||
if (_alloc_packed) {
|
||||
dev_packed.clear();
|
||||
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_packed.row_bytes();
|
||||
}
|
||||
_max_nbors=mn;
|
||||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
const int nall, Atom<numtyp,acctyp> &atom,
|
||||
|
@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||
|
||||
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
|
||||
|
||||
if (_maxspecial>0) {
|
||||
time_nbor.start();
|
||||
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
|
||||
view_nspecial.view(nspecial[0],nt*3,*dev);
|
||||
view_special.view(special[0],nt*_maxspecial,*dev);
|
||||
view_tag.view(tag,nall,*dev);
|
||||
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
|
||||
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
|
||||
ucl_copy(atom.dev_tag,view_tag,nall,false);
|
||||
time_nbor.stop();
|
||||
if (_time_device)
|
||||
time_nbor.add_to_total();
|
||||
time_transpose.start();
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||
&_maxspecial,&nt);
|
||||
time_transpose.stop();
|
||||
}
|
||||
|
||||
// If binning on CPU, do this now
|
||||
if (_gpu_nbor==2) {
|
||||
double stime = MPI_Wtime();
|
||||
|
@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||
cell_id[i]=id;
|
||||
host_cell_counts[id+1]++;
|
||||
}
|
||||
|
||||
mn=0;
|
||||
for (int i=0; i<_ncells; i++)
|
||||
mn=std::max(mn,host_cell_counts[i]);
|
||||
mn*=8;
|
||||
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||
if (!success)
|
||||
return;
|
||||
_total_atoms=nt;
|
||||
|
||||
cell_iter[0]=0;
|
||||
for (int i=1; i<_ncells; i++) {
|
||||
host_cell_counts[i]+=host_cell_counts[i-1];
|
||||
|
@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||
_bin_time+=MPI_Wtime()-stime;
|
||||
}
|
||||
|
||||
if (_maxspecial>0) {
|
||||
time_nbor.start();
|
||||
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
|
||||
view_nspecial.view(nspecial[0],nt*3,*dev);
|
||||
view_special.view(special[0],nt*_maxspecial,*dev);
|
||||
view_tag.view(tag,nall,*dev);
|
||||
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
|
||||
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
|
||||
ucl_copy(atom.dev_tag,view_tag,nall,false);
|
||||
time_nbor.stop();
|
||||
if (_time_device)
|
||||
time_nbor.add_to_total();
|
||||
time_kernel.start();
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
|
||||
&_maxspecial,&nt);
|
||||
} else
|
||||
time_kernel.start();
|
||||
time_kernel.start();
|
||||
|
||||
_nbor_pitch=inum;
|
||||
_shared->neigh_tex.bind_float(atom.dev_x,4);
|
||||
|
@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||
/* Get the maximum number of nbors and realloc if necessary */
|
||||
UCL_D_Vec<int> numj;
|
||||
numj.view_offset(inum,dev_nbor,inum);
|
||||
ucl_copy(host_acc,numj,inum,false);
|
||||
ucl_copy(host_acc,numj,inum,true);
|
||||
if (nt>inum) {
|
||||
UCL_H_Vec<int> host_offset;
|
||||
host_offset.view_offset(inum,host_acc,nt-inum);
|
||||
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
|
||||
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
|
||||
}
|
||||
mn=host_acc[0];
|
||||
for (int i=1; i<nt; i++)
|
||||
mn=std::max(mn,host_acc[i]);
|
||||
|
||||
if (_gpu_nbor!=2) {
|
||||
host_acc.sync();
|
||||
mn=host_acc[0];
|
||||
for (int i=1; i<nt; i++)
|
||||
mn=std::max(mn,host_acc[i]);
|
||||
|
||||
if (mn>_max_nbors) {
|
||||
mn=static_cast<int>(static_cast<double>(mn)*1.10);
|
||||
dev_nbor.clear();
|
||||
success=success &&
|
||||
(dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
|
||||
_gpu_bytes=dev_nbor.row_bytes();
|
||||
if (_max_host>0) {
|
||||
host_nbor.clear();
|
||||
dev_host_nbor.clear();
|
||||
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
|
||||
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
|
||||
success=success && (dev_host_nbor.alloc(mn*_max_host,
|
||||
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
int *ptr=host_nbor.begin();
|
||||
for (int i=0; i<_max_host; i++) {
|
||||
host_jlist[i]=ptr;
|
||||
ptr+=mn;
|
||||
}
|
||||
_gpu_bytes+=dev_host_nbor.row_bytes();
|
||||
} else {
|
||||
dev_host_nbor.view(dev_nbor);
|
||||
dev_host_numj.view(dev_nbor);
|
||||
}
|
||||
if (_alloc_packed) {
|
||||
dev_packed.clear();
|
||||
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_packed.row_bytes();
|
||||
}
|
||||
if (!success)
|
||||
if (mn>_max_nbors) {
|
||||
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||
if (!success)
|
||||
return;
|
||||
time_kernel.stop();
|
||||
if (_time_device)
|
||||
time_kernel.add_to_total();
|
||||
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
return;
|
||||
_max_nbors=mn;
|
||||
time_kernel.stop();
|
||||
if (_time_device)
|
||||
time_kernel.add_to_total();
|
||||
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
|
||||
special, success, mn);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (_maxspecial>0) {
|
||||
|
@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||
time_kernel.stop();
|
||||
|
||||
time_nbor.start();
|
||||
if (inum<nt)
|
||||
ucl_copy(host_nbor,dev_host_nbor,false);
|
||||
if (inum<nt) {
|
||||
ucl_copy(host_nbor,dev_host_nbor,true);
|
||||
host_nbor.sync();
|
||||
}
|
||||
time_nbor.stop();
|
||||
}
|
||||
|
||||
|
|
|
@ -106,13 +106,24 @@ class Neighbor {
|
|||
|
||||
inline void acc_timers() {
|
||||
if (_nbor_time_avail) {
|
||||
time_nbor.add_to_total();
|
||||
time_kernel.add_to_total();
|
||||
if (_gpu_nbor==2) {
|
||||
time_hybrid1.add_to_total();
|
||||
time_hybrid2.add_to_total();
|
||||
int mn=0;
|
||||
for (int i=0; i<_total_atoms; i++)
|
||||
mn=std::max(mn,host_acc[i]);
|
||||
if (mn>_max_nbors)
|
||||
assert(0==1);
|
||||
}
|
||||
if (_time_device) {
|
||||
time_nbor.add_to_total();
|
||||
time_kernel.add_to_total();
|
||||
if (_gpu_nbor==2) {
|
||||
time_hybrid1.add_to_total();
|
||||
time_hybrid2.add_to_total();
|
||||
}
|
||||
if (_maxspecial>0)
|
||||
time_transpose.add_to_total();
|
||||
_nbor_time_avail=false;
|
||||
}
|
||||
_nbor_time_avail=false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -213,7 +224,7 @@ class Neighbor {
|
|||
UCL_D_Vec<int> dev_cell_counts;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
|
||||
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
|
||||
|
||||
private:
|
||||
NeighborShared *_shared;
|
||||
|
@ -228,6 +239,10 @@ class Neighbor {
|
|||
|
||||
int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
|
||||
int _threads_per_atom;
|
||||
int _total_atoms;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
inline void resize_max_neighbors(const int maxn, bool &success);
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
|
|||
__kernel void transpose(__global int *out, __global int *in, int columns_in,
|
||||
int rows_in)
|
||||
{
|
||||
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||
__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||
|
||||
unsigned ti=THREAD_ID_X;
|
||||
unsigned tj=THREAD_ID_Y;
|
||||
|
|
Loading…
Reference in New Issue