git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-03-02 15:57:05 +00:00
parent 17091c755e
commit d30ba2d7eb
9 changed files with 193 additions and 122 deletions

View File

@ -1,2 +1 @@
Geryon Version 11.094
Geryon Version 12.034

View File

@ -117,6 +117,14 @@ class UCL_Device {
_cq.pop_back();
}
/// Set the default command queue (by default this is the null stream)
/** \param i index of the command queue (as added by push_command_queue())
If i is 0, the default command queue is set to the null stream **/
inline void set_command_queue(const int i) {
if (i==0) _cq[0]=0;
else _cq[0]=_cq[i];
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) {
if (_device>-1) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
_cq[0]=0;
}
_device=_properties[num].device_id;
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
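The new set_command_queue() lets a caller route all default-queue work onto a previously pushed stream instead of the null stream. A minimal usage sketch, mirroring how lal_device.cpp uses this later in the same commit (surrounding class and error handling omitted):

  UCL_Device gpu;
  if (gpu.set(my_gpu)!=UCL_SUCCESS)   // select device; tears down any earlier context
    return -6;
  gpu.push_command_queue();           // create command queue at index 1
  gpu.set_command_queue(1);           // default queue (index 0) now aliases queue 1

Calling set_command_queue(0) restores the null stream, matching the reset done in UCL_Device::set() above.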

View File

@ -34,11 +34,11 @@ class UCL_Texture;
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) {}
inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
inline ~UCL_Program() {}
/// Initialize the program with a device
inline void init(UCL_Device &device) { }
inline void init(UCL_Device &device) { _cq=device.cq(); }
/// Clear any data associated with program
/** \note Must call init() after each clear **/
@ -130,6 +130,7 @@ class UCL_Program {
friend class UCL_Kernel;
private:
CUmodule _module;
CUstream _cq;
friend class UCL_Texture;
};
@ -141,7 +142,7 @@ class UCL_Kernel {
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; set_function(program,function); }
{ _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
~UCL_Kernel() {}
@ -160,6 +161,7 @@ class UCL_Kernel {
#endif
return UCL_FUNCTION_NOT_FOUND;
}
_cq=program._cq;
return UCL_SUCCESS;
}
@ -229,7 +231,7 @@ class UCL_Kernel {
/// Run the kernel in the default command queue
inline void run() {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
}
/// Run the kernel in the specified command queue
@ -245,6 +247,7 @@ class UCL_Kernel {
private:
CUfunction _kernel;
CUstream _cq;
unsigned _dimensions;
unsigned _num_blocks[2];
unsigned _num_args;
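With these changes a UCL_Kernel inherits its command queue from the UCL_Program it is built from, which in turn copies the device's default queue, so run() no longer launches on the hard-coded null stream. A sketch of the resulting behavior, assuming the program has already been loaded with a kernel named "my_kernel" and that grid/argument setup (not shown in this hunk) has been done:

  UCL_Program prog(gpu);                 // _cq = gpu.cq(), the device's current default queue
  UCL_Kernel k(prog, "my_kernel");       // set_function() copies prog._cq into the kernel
  k.run();                               // cuLaunchGridAsync(..., _cq) instead of stream 0

Because the queue is copied at construction or set_function() time, programs and kernels created before set_command_queue() is called keep the earlier default.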

View File

@ -57,6 +57,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
@ -72,6 +73,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
@ -92,6 +94,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
@ -101,6 +104,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=dev.cq();
return UCL_SUCCESS;
}
@ -115,6 +119,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=cm.cq();
return UCL_SUCCESS;
}
@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
mat.cq()=d.cq();
return UCL_SUCCESS;
}
@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> {
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> {
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -529,7 +535,7 @@ template <int mem1, int mem2> struct _ucl_memcpy {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@ -546,8 +552,8 @@ template <int mem1, int mem2> struct _ucl_memcpy {
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -560,16 +566,16 @@ template <int mem1, int mem2> struct _ucl_memcpy {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols,cq));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
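The common thread in these hunks: every allocation now records the command queue of the object it was allocated against (mat.cq()=cm.cq() or dev.cq()), and the device-to-device paths in the queued overloads use cuMemcpyDtoDAsync on that queue instead of the blocking cuMemcpyDtoD. A rough sketch of the intended usage pattern, with container names and the sync() call borrowed from the neighbor-list code later in this commit (the exact alloc() signatures here are assumptions):

  UCL_H_Vec<int> host;  host.alloc(n,gpu);                  // host.cq() = gpu.cq()
  UCL_D_Vec<int> dev;   dev.alloc(n,gpu,UCL_READ_ONLY);     // dev.cq()  = gpu.cq()
  ucl_copy(dev,host,n,true);     // async copy, enqueued on the containers' queue
  ucl_copy(host,dev,n,true);
  host.sync();                   // drain the queue before reading the data on the host

The point is only that async copies and kernel launches issued through containers allocated against the same device or parent end up ordered on one stream.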

View File

@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
if (my_gpu>=gpu->num_devices())
return -2;
#ifndef CUDA_PRX
if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
return -7;
#endif
if (gpu->set(my_gpu)!=UCL_SUCCESS)
return -6;
gpu->push_command_queue();
gpu->set_command_queue(1);
_long_range_precompute=0;
int flag=compile_kernels();
int flag=0;
for (int i=0; i<_procs_per_gpu; i++) {
if (_gpu_rank==i)
flag=compile_kernels();
gpu_barrier();
}
return flag;
}
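The compile_kernels() call is now serialized across the processes sharing a GPU: each rank takes its turn inside the loop while the others wait at gpu_barrier(), so only one rank JIT-compiles at a time and every rank leaves the loop with its own return flag. A minimal sketch of what gpu_barrier() has to provide (the communicator name is an assumption, not taken from this diff):

  inline void gpu_barrier() { MPI_Barrier(_comm_gpu); }   // barrier over the ranks sharing this GPU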

View File

@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
// copy fp from device to host for comm
_nlocal=nlocal;
time_fp1.start();
ucl_copy(host_fp,dev_fp,nlocal,false);
ucl_copy(host_fp,dev_fp,nlocal,true);
time_fp1.stop();
time_fp1.sync_stop();
}
// ---------------------------------------------------------------------------
@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
// copy fp from device to host for comm
_nlocal=inum_full;
time_fp1.start();
ucl_copy(host_fp,dev_fp,inum_full,false);
ucl_copy(host_fp,dev_fp,inum_full,true);
time_fp1.stop();
time_fp1.sync_stop();
return this->nbor->host_jlist.begin()-host_start;
}
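Both compute() paths switch the fp transfer to an asynchronous copy and pair it with sync_stop() on the timer. Read together, the new sequence is (assuming UCL_Timer::sync_stop() synchronizes the queue before stopping, as the name suggests):

  time_fp1.start();
  ucl_copy(host_fp,dev_fp,nlocal,true);   // async device-to-host copy on the default queue
  time_fp1.sync_stop();                   // waits for the copy, so the timing covers the transfer

Since sync_stop() waits on the queue, host_fp is complete before the communication step that follows uses it.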

View File

@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
time_kernel.init(*dev);
time_hybrid1.init(*dev);
time_hybrid2.init(*dev);
time_transpose.init(*dev);
time_nbor.zero();
time_kernel.zero();
time_hybrid1.zero();
time_hybrid2.zero();
time_transpose.zero();
_max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
if (_max_atoms==0)
@ -203,6 +205,7 @@ void Neighbor::clear() {
time_nbor.clear();
time_hybrid1.clear();
time_hybrid2.clear();
time_transpose.clear();
}
}
@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
}
}
template <class numtyp, class acctyp>
void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
if (maxn>_max_nbors) {
int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
dev_nbor.clear();
success=success &&
(dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
} else {
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
_max_nbors=mn;
}
}
template <class numtyp, class acctyp>
void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int nall, Atom<numtyp,acctyp> &atom,
@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
time_transpose.stop();
}
// If binning on CPU, do this now
if (_gpu_nbor==2) {
double stime = MPI_Wtime();
@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
cell_id[i]=id;
host_cell_counts[id+1]++;
}
mn=0;
for (int i=0; i<_ncells; i++)
mn=std::max(mn,host_cell_counts[i]);
mn*=8;
resize_max_neighbors<numtyp,acctyp>(mn,success);
if (!success)
return;
_total_atoms=nt;
cell_iter[0]=0;
for (int i=1; i<_ncells; i++) {
host_cell_counts[i]+=host_cell_counts[i-1];
@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
_bin_time+=MPI_Wtime()-stime;
}
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
view_nspecial.view(nspecial[0],nt*3,*dev);
view_special.view(special[0],nt*_maxspecial,*dev);
view_tag.view(tag,nall,*dev);
ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
ucl_copy(atom.dev_tag,view_tag,nall,false);
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
time_kernel.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
time_kernel.start();
_nbor_pitch=inum;
_shared->neigh_tex.bind_float(atom.dev_x,4);
@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
numj.view_offset(inum,dev_nbor,inum);
ucl_copy(host_acc,numj,inum,false);
ucl_copy(host_acc,numj,inum,true);
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
ucl_copy(host_offset,dev_host_numj,nt-inum,true);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
if (_gpu_nbor!=2) {
host_acc.sync();
mn=host_acc[0];
for (int i=1; i<nt; i++)
mn=std::max(mn,host_acc[i]);
if (mn>_max_nbors) {
mn=static_cast<int>(static_cast<double>(mn)*1.10);
dev_nbor.clear();
success=success &&
(dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
_gpu_bytes=dev_nbor.row_bytes();
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
} else {
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
}
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_packed.row_bytes();
}
if (!success)
if (mn>_max_nbors) {
resize_max_neighbors<numtyp,acctyp>(mn,success);
if (!success)
return;
time_kernel.stop();
if (_time_device)
time_kernel.add_to_total();
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
nspecial, special, success, mn);
return;
_max_nbors=mn;
time_kernel.stop();
if (_time_device)
time_kernel.add_to_total();
build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
}
if (_maxspecial>0) {
@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_kernel.stop();
time_nbor.start();
if (inum<nt)
ucl_copy(host_nbor,dev_host_nbor,false);
if (inum<nt) {
ucl_copy(host_nbor,dev_host_nbor,true);
host_nbor.sync();
}
time_nbor.stop();
}
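Taken together, the neighbor-list build now pre-sizes the device arrays when binning is done on the CPU: the per-cell atom counts give an upper bound of 8 times the largest cell population, resize_max_neighbors() grows the storage if that bound exceeds _max_nbors, and the special-neighbor transpose is timed separately under time_transpose. A condensed sketch of the control flow added in this file (names as in the hunks, bodies elided; pseudocode, not compilable as written):

  // binning on the CPU (_gpu_nbor==2): bound the neighbor count before the build kernel
  mn = 8 * max over i of host_cell_counts[i];
  resize_max_neighbors<numtyp,acctyp>(mn,success);
  if (!success) return;
  ...
  // GPU binning (_gpu_nbor!=2): check the real per-atom counts after the kernel and
  // rebuild with larger storage if the initial guess was too small
  mn = max over i of host_acc[i];
  if (mn>_max_nbors) {
    resize_max_neighbors<numtyp,acctyp>(mn,success);
    if (!success) return;
    build_nbor_list(x,inum,host_inum,nall,atom,sublo,subhi,tag,nspecial,special,success,mn);
    return;
  }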

View File

@ -106,13 +106,24 @@ class Neighbor {
inline void acc_timers() {
if (_nbor_time_avail) {
time_nbor.add_to_total();
time_kernel.add_to_total();
if (_gpu_nbor==2) {
time_hybrid1.add_to_total();
time_hybrid2.add_to_total();
int mn=0;
for (int i=0; i<_total_atoms; i++)
mn=std::max(mn,host_acc[i]);
if (mn>_max_nbors)
assert(0==1);
}
if (_time_device) {
time_nbor.add_to_total();
time_kernel.add_to_total();
if (_gpu_nbor==2) {
time_hybrid1.add_to_total();
time_hybrid2.add_to_total();
}
if (_maxspecial>0)
time_transpose.add_to_total();
_nbor_time_avail=false;
}
_nbor_time_avail=false;
}
}
@ -213,7 +224,7 @@ class Neighbor {
UCL_D_Vec<int> dev_cell_counts;
/// Device timers
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
private:
NeighborShared *_shared;
@ -228,6 +239,10 @@ class Neighbor {
int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
int _threads_per_atom;
int _total_atoms;
template <class numtyp, class acctyp>
inline void resize_max_neighbors(const int maxn, bool &success);
};
}

View File

@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
__kernel void transpose(__global int *out, __global int *in, int columns_in,
int rows_in)
{
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
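The functional change here is the element type of the staging tile: the kernel transposes int data (the special-neighbor lists), and routing those values through a float tile converts them, which can lose precision for large values; declaring the __local block as int fixes that. The +1 in the second dimension is the usual padding that keeps column accesses in the tile from hitting the same shared-memory bank. For reference, a generic sketch of the tiled-transpose pattern this kernel follows, written in plain CUDA rather than the Geryon macro dialect used in the file; the kernel body past the thread-index setup above is not shown in this diff, so this is the standard pattern only:

  #define BLOCK_CELL_2D 8   // illustrative tile size; the real value is set at build time

  __global__ void transpose(int *out, const int *in, int columns_in, int rows_in) {
    __shared__ int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];   // +1 avoids bank conflicts
    unsigned ti=threadIdx.x, tj=threadIdx.y;
    unsigned i=blockIdx.x*BLOCK_CELL_2D+ti;
    unsigned j=blockIdx.y*BLOCK_CELL_2D+tj;
    if (i<columns_in && j<rows_in)
      block[tj][ti]=in[j*columns_in+i];          // coalesced read of the input row
    __syncthreads();
    i=blockIdx.y*BLOCK_CELL_2D+ti;               // transposed coordinates
    j=blockIdx.x*BLOCK_CELL_2D+tj;
    if (i<rows_in && j<columns_in)
      out[j*rows_in+i]=block[ti][tj];            // coalesced write of the output row
  }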