diff --git a/lib/gpu/lal_lj_tip4p_long.cpp b/lib/gpu/lal_lj_tip4p_long.cpp index d44edc8cbd..f60f162d7b 100644 --- a/lib/gpu/lal_lj_tip4p_long.cpp +++ b/lib/gpu/lal_lj_tip4p_long.cpp @@ -74,11 +74,11 @@ int LJTIP4PLongT::init(const int ntypes, // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; -// int max_shared_types=this->device->max_shared_types(); -// if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { -// lj_types=max_shared_types; -// shared_types=true; -// } + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } _lj_types=lj_types; // Allocate a host write buffer for data initialization @@ -202,7 +202,6 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - this->k_pair.set_size(GX,BX); if (vflag) { this->ansO.resize_ib(ainum*3); } else { @@ -210,13 +209,25 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { } this->ansO.zero(); this->device->gpu->sync(); - this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + if(shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &hneight, &m, &TypeO, &TypeH, &alpha, &this->atom->q, &cutsq, &_qqrd2e, &_g_ewald, &cut_coulsq, &cut_coulsqplus, &this->ansO); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &hneight, &m, &TypeO, &TypeH, &alpha, + &this->atom->q, &cutsq, &_qqrd2e, &_g_ewald, + &cut_coulsq, &cut_coulsqplus, &this->ansO); + } GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); this->k_pair_distrib.set_size(GX,BX); this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 147c460795..c101196433 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -572,4 +572,328 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } // if ii } -__kernel void k_lj_tip4p_long_fast() {} +__kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom, + __global int *restrict hneigh, + __global numtyp4 *restrict m, + const int typeO, const int typeH, + const numtyp alpha, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const numtyp g_ewald, + const numtyp cut_coulsq, const numtyp cut_coulsqplus, + __global acctyp4 *restrict ansO) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + const numtyp eq_zero = 1e-6; + + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[8]; + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid0) + lj3[tid]=lj3_in[tid]; + } + acctyp energy = (acctyp)0; + acctyp e_coul = (acctyp)0; + acctyp4 f, fO; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; + acctyp virial[6],vO[6]; + for (int i=0; i<6; i++) { + virial[i]=(acctyp)0; + vO[i]=(acctyp)0; + } + + __syncthreads(); + if (ii= inum) { + non_local_oxy = 1; + } + } + + for ( ; nbor0) { + numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y); + energy += factor_lj * (e - lj3[mtype].z); + } + if (vflag>0) { + virial[0] += delx*delx*forcelj; + virial[1] += dely*dely*forcelj; + virial[2] += delz*delz*forcelj; + virial[3] += delx*dely*forcelj; + virial[4] += delx*delz*forcelj; + virial[5] += dely*delz*forcelj; + } + } // if LJ + + if (rsq < cut_coulsqplus) { //cut_coulsqplus + int jH1, jH2, jO; + numtyp qj; fetch(qj,j,q_tex); + numtyp4 x2 = jx; + if(itype == typeO || jtype == typeO) { + if (jtype == typeO) { + jO = j; + jH1 = hneigh[j*4 ]; + jH2 = hneigh[j*4+1]; + x2 = m[j]; + } + delx = x1.x-x2.x; + dely = x1.y-x2.y; + delz = x1.z-x2.z; + rsq = delx*delx+dely*dely+delz*delz; + } + if (rsq < cut_coulsq) { + numtyp r2inv = ucl_recip(rsq); + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + + numtyp prefactor = qj; + prefactor *= qqrd2e*qtmp/r; + numtyp force_coul = r2inv*prefactor * (_erfc + EWALD_F*grij*expm2 - factor_coul); + + if (itype == typeH) { + f.x += delx * force_coul; + f.y += dely * force_coul; + f.z += delz * force_coul; + f.w += 0; + } else { + fO.x += delx * force_coul; + fO.y += dely * force_coul; + fO.z += delz * force_coul; + fO.w += 0; + } + if (eflag>0) { + e_coul += prefactor*(_erfc-factor_coul); + } + if (vflag>0) { + acctyp4 fd; + fd.x = delx*force_coul; + fd.y = dely*force_coul; + fd.z = delz*force_coul; + if (itype == typeH) { + if (jtype == typeH) { + virial[0] += delx*fd.x; + virial[1] += dely*fd.y; + virial[2] += delz*fd.z; + virial[3] += delx*fd.y; + virial[4] += delx*fd.z; + virial[5] += dely*fd.z; + } else { + numtyp cO = 1 - alpha, cH = 0.5*alpha; + numtyp4 vdj; + numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex); + numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex); + numtyp4 xjO; fetch4(xjO,jO,pos_tex); + vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH; + vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH; + vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH; + //vdj.w = vdj.w; + virial[0] += (ix.x - vdj.x)*fd.x; + virial[1] += (ix.y - vdj.y)*fd.y; + virial[2] += (ix.z - vdj.z)*fd.z; + virial[3] += (ix.x - vdj.x)*fd.y; + virial[4] += (ix.x - vdj.x)*fd.z; + virial[5] += (ix.y - vdj.y)*fd.z; + } + } else { + numtyp cO = 1 - alpha, cH = 0.5*alpha; + numtyp4 vdi, vdj; + numtyp4 xH1; fetch4(xH1,iH1,pos_tex); + numtyp4 xH2; fetch4(xH2,iH2,pos_tex); + numtyp4 xO; fetch4(xO,iO,pos_tex); + vdi.x = xO.x*cO + xH1.x*cH + xH2.x*cH; + vdi.y = xO.y*cO + xH1.y*cH + xH2.y*cH; + vdi.z = xO.z*cO + xH1.z*cH + xH2.z*cH; + //vdi.w = vdi.w; + if (jtype != typeH) { + numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex); + numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex); + numtyp4 xjO; fetch4(xjO,jO,pos_tex); + vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH; + vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH; + vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH; + //vdj.w = vdj.w; + } else vdj = jx; + vO[0] += 0.5*(vdi.x - vdj.x)*fd.x; + vO[1] += 0.5*(vdi.y - vdj.y)*fd.y; + vO[2] += 0.5*(vdi.z - vdj.z)*fd.z; + vO[3] += 0.5*(vdi.x - vdj.x)*fd.y; + vO[4] += 0.5*(vdi.x - vdj.x)*fd.z; + vO[5] += 0.5*(vdi.y - vdj.y)*fd.z; + } + } + } + if (non_local_oxy == 1) { + if (iO == j) { + x2 = ix; + qj = qtmp; + } + numtyp4 x1m = m[iO]; + delx = x1m.x-x2.x; + dely = x1m.y-x2.y; + delz = x1m.z-x2.z; + rsq = delx*delx+dely*dely+delz*delz; + if (rsq < cut_coulsq) { + numtyp r2inv = ucl_recip(rsq); + numtyp r = ucl_rsqrt(r2inv); + numtyp grij = g_ewald * r; + numtyp expm2 = ucl_exp(-grij*grij); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + + numtyp prefactor = qj; + prefactor *= qqrd2e*x1m.w/r; + numtyp force_coul = r2inv*prefactor * (_erfc + EWALD_F*grij*expm2 - factor_coul); + + numtyp cO = 1 - alpha, cH = 0.5*alpha; + numtyp4 fd; + fd.x = delx * force_coul * cH; + fd.y = dely * force_coul * cH; + fd.z = delz * force_coul * cH; + + f.x += fd.x; + f.y += fd.y; + f.z += fd.z; + + if (eflag>0) { + e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha; + } + if (vflag>0) { + numtyp4 xH1; fetch4(xH1,iH1,pos_tex); + numtyp4 xH2; fetch4(xH2,iH2,pos_tex); + numtyp4 xO; fetch4(xO,iO,pos_tex); + + virial[0] += ((xO.x*cO + xH1.x*cH + xH2.x*cH) - x2.x) * fd.x; + virial[1] += ((xO.y*cO + xH1.y*cH + xH2.y*cH) - x2.y) * fd.y; + virial[2] += ((xO.z*cO + xH1.z*cH + xH2.z*cH) - x2.z) * fd.z; + virial[3] += ((xO.x*cO + xH1.x*cH + xH2.x*cH) - x2.x) * fd.y; + virial[4] += ((xO.x*cO + xH1.x*cH + xH2.x*cH) - x2.x) * fd.z; + virial[5] += ((xO.y*cO + xH1.y*cH + xH2.y*cH) - x2.y) * fd.z; + } + } + } + } // if cut_coulsqplus + } // for nbor + if (t_per_atom>1) { +#if (ARCH < 300) + __local acctyp red_acc[6][BLOCK_PAIR]; + red_acc[0][tid]=fO.x; + red_acc[1][tid]=fO.y; + red_acc[2][tid]=fO.z; + red_acc[3][tid]=fO.w; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + fO.x=red_acc[0][tid]; + fO.y=red_acc[1][tid]; + fO.z=red_acc[2][tid]; + fO.w=red_acc[3][tid]; + if (vflag>0) { + for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; + } +#else + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + fO.x += shfl_xor(fO.x, s, t_per_atom); + fO.y += shfl_xor(fO.y, s, t_per_atom); + fO.z += shfl_xor(fO.z, s, t_per_atom); + fO.w += shfl_xor(fO.w, s, t_per_atom); + } + if (vflag>0) { + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + for (int r=0; r<6; r++) + vO[r] += shfl_xor(vO[r], s, t_per_atom); + } + } +#endif + } + if(offset == 0) { + ansO[i] = fO; + if (vflag>0) { + ansO[inum + i].x = vO[0]; + ansO[inum + i].y = vO[1]; + ansO[inum + i].z = vO[2]; + ansO[inum*2 + i].x = vO[3]; + ansO[inum*2 + i].y = vO[4]; + ansO[inum*2 + i].z = vO[5]; + } + } + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); + } // if ii +}