From 34fe2273f64bbee8f96ab91642106471ad77c25b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 8 Jul 2017 14:59:48 -0500 Subject: [PATCH] Added short neighbor list implementation for tersoff/zbl/gpu and tersoff/mod/gpu --- lib/gpu/lal_tersoff_mod.cpp | 58 +++++++++--- lib/gpu/lal_tersoff_mod.cu | 173 ++++++++++++++++++++++++++++-------- lib/gpu/lal_tersoff_mod.h | 3 +- lib/gpu/lal_tersoff_zbl.cpp | 58 +++++++++--- lib/gpu/lal_tersoff_zbl.cu | 171 +++++++++++++++++++++++++++-------- lib/gpu/lal_tersoff_zbl.h | 2 +- 6 files changed, 365 insertions(+), 100 deletions(-) diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 553dad3583..ba1804c37e 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end"); + "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -250,7 +256,7 @@ void TersoffMT::compute(const int f_ago, const int inum_full, const int nall, this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); + this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); } this->atom->cast_x_data(host_x,host_type); @@ -258,11 +264,13 @@ void TersoffMT::compute(const int f_ago, const int inum_full, const int nall, this->atom->add_x_data(host_x,host_type); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nlist; + int _eflag; if (eflag) _eflag=1; @@ -329,7 +337,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -343,11 +351,13 @@ int ** TersoffMT::compute(const int ago, const int inum_full, *jnum=this->nbor->host_acc.begin(); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nall; + int _eflag; if (eflag) _eflag=1; @@ -402,9 +412,32 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -423,6 +456,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +471,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +480,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 3a81b36941..75bacc2179 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -106,7 +106,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,57 @@ texture ts5_tex; #endif +__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, + const numtyp cutshortsq, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; // compute zeta_ij - z = (numtyp)0; + z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int numk = dev_short_nbor[nbor_k-n_stride]; + int k_end = nbor_k+fast_mul(numk,n_stride); + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=dev_short_nbor[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -287,10 +347,11 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j - n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -430,6 +491,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -470,15 +532,20 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + int nborj_start = nbor_j; + nbor_end = nbor_j+fast_mul(numj,n_stride); + for ( ; nbor_j { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 9cce8a802d..6efa8b9487 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end"); + "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor"); if (success!=0) return success; @@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -275,7 +281,7 @@ void TersoffZT::compute(const int f_ago, const int inum_full, const int nall, this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); + this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); } this->atom->cast_x_data(host_x,host_type); @@ -283,11 +289,13 @@ void TersoffZT::compute(const int f_ago, const int inum_full, const int nall, this->atom->add_x_data(host_x,host_type); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nlist; + int _eflag; if (eflag) _eflag=1; @@ -354,7 +362,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -368,11 +376,13 @@ int ** TersoffZT::compute(const int ago, const int inum_full, *jnum=this->nbor->host_acc.begin(); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nall; + int _eflag; if (eflag) _eflag=1; @@ -427,9 +437,32 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -449,6 +482,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -463,7 +497,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -472,7 +506,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 9509b9802c..439d4028df 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -109,7 +109,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -158,7 +158,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -167,6 +167,57 @@ texture ts6_tex; #endif +__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, + const numtyp cutshortsq, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; - int _max_nbors; numtyp _global_e,_global_a_0,_global_epsilon_0; + numtyp _cutshortsq; private: bool _allocated;