Updated pair coul/long/cs and born/coul/long/cs; updated gpu neighbor builds to support core-shell styles where r2 can be tiny.

This commit is contained in:
Trung Nguyen 2018-06-19 15:50:02 -05:00
parent 6842a527e0
commit 70a7b37614
3 changed files with 92 additions and 83 deletions

View File

@ -9,7 +9,7 @@
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// begin : June 2018
// email : trung.nguyen@northwestern.edu
// ***************************************************************************/
@ -29,15 +29,19 @@ texture<int2> q_tex;
#define q_tex q_
#endif
#define B0 (numtyp)-0.1335096380159268
#define B1 (numtyp)-2.57839507e-1
#define B2 (numtyp)-1.37203639e-1
#define B3 (numtyp)-8.88822059e-3
#define B4 (numtyp)-5.80844129e-3
#define B5 (numtyp)1.14652755e-1
#define EPSILON (numtyp)1.0e-20
#define EPS_EWALD (numtyp)1.0e-6
#define EPS_EWALD_SQR (numtyp)1.0e-12
// Note: EWALD_P is different from that in lal_preprocessor.h
// acctyp is needed for these parameters
#define CS_EWALD_P (acctyp)9.95473818e-1
#define B0 (acctyp)-0.1335096380159268
#define B1 (acctyp)-2.57839507e-1
#define B2 (acctyp)-1.37203639e-1
#define B3 (acctyp)-8.88822059e-3
#define B4 (acctyp)-5.80844129e-3
#define B5 (acctyp)1.14652755e-1
#define EPSILON (acctyp)(1.0e-20)
#define EPS_EWALD (acctyp)(1.0e-6)
#define EPS_EWALD_SQR (acctyp)(1.0e-12)
__kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict coeff1,
@ -104,40 +108,38 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
numtyp r2inv = ucl_recip(rsq);
if (rsq < cut_coulsq) {
rsq += EPSILON;
numtyp r2inv = ucl_recip(rsq);
numtyp r = ucl_rsqrt(r2inv);
numtyp r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp;
if (factor_coul<(numtyp)1.0) {
numtyp grij = g_ewald * (r+EPS_EWALD);
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= (r+EPS_EWALD);
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
// Additionally r2inv needs to be accordingly modified since the later
// scaling of the overall force shall be consistent
r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
forcecoul *= r2inv;
} else {
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= r;
forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
forcecoul *= r2inv;
}
} else forcecoul = (numtyp)0.0;
numtyp r2inv = ucl_recip(rsq);
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
@ -146,7 +148,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = forceborn*r2inv + forcecoul;
force = (forcecoul + forceborn) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
@ -154,9 +156,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
if (eflag>0) {
if (rsq < cut_coulsq) {
e_coul += prefactor*_erfc;
if (factor_coul<(numtyp)1.0)
e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
@ -248,39 +250,38 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq_sigma[mtype].x) { // cutsq
numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
numtyp rexp = (numtyp)0.0;
numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
numtyp r2inv = ucl_recip(rsq);
if (rsq < cut_coulsq) {
rsq += EPSILON;
numtyp r2inv = ucl_recip(rsq);
numtyp r = ucl_rsqrt(r2inv);
numtyp r = ucl_sqrt(rsq);
fetch(prefactor,j,q_tex);
prefactor *= qqrd2e * qtmp;
if (factor_coul<(numtyp)1.0) {
numtyp grij = g_ewald * (r+EPS_EWALD);
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= (r+EPS_EWALD);
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
// Additionally r2inv needs to be accordingly modified since the later
// scaling of the overall force shall be consistent
r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
forcecoul *= r2inv;
} else {
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= r;
forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
forcecoul *= r2inv;
}
} else forcecoul = (numtyp)0.0;
numtyp r2inv = ucl_recip(rsq);
if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
numtyp r = ucl_sqrt(rsq);
rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
@ -289,7 +290,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
+ coeff1[mtype].w*r2inv*r6inv)*factor_lj;
} else forceborn = (numtyp)0.0;
force = forceborn*r2inv + forcecoul;
force = (forcecoul + forceborn) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
@ -297,9 +298,9 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
if (eflag>0) {
if (rsq < cut_coulsq) {
e_coul += prefactor*_erfc;
if (factor_coul<(numtyp)1.0)
e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (rsq < cutsq_sigma[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv

View File

@ -29,6 +29,20 @@ texture<int2> q_tex;
#define q_tex q_
#endif
// Note: EWALD_P is different from that in lal_preprocessor.h
// acctyp is needed for these parameters
#define CS_EWALD_P (acctyp)9.95473818e-1
#define B0 (acctyp)-0.1335096380159268
#define B1 (acctyp)-2.57839507e-1
#define B2 (acctyp)-1.37203639e-1
#define B3 (acctyp)-8.88822059e-3
#define B4 (acctyp)-5.80844129e-3
#define B5 (acctyp)1.14652755e-1
#define EPSILON (acctyp)(1.0e-20)
#define EPS_EWALD (acctyp)(1.0e-6)
#define EPS_EWALD_SQR (acctyp)(1.0e-12)
#if (ARCH < 300)
#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \
@ -123,16 +137,6 @@ texture<int2> q_tex;
#endif
#define B0 (numtyp)-0.1335096380159268
#define B1 (numtyp)-2.57839507e-1
#define B2 (numtyp)-1.37203639e-1
#define B3 (numtyp)-8.88822059e-3
#define B4 (numtyp)-5.80844129e-3
#define B5 (numtyp)1.14652755e-1
#define EPSILON (numtyp)1.0e-20
#define EPS_EWALD (numtyp)1.0e-6
#define EPS_EWALD_SQR (numtyp)1.0e-12
__kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
const __global numtyp *restrict scale,
const int lj_types,
@ -191,7 +195,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
int mtype=itype*lj_types+jtype;
if (rsq < cut_coulsq) {
rsq += EPSILON;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
numtyp force,prefactor,_erfc;
numtyp r2inv = ucl_recip(rsq);
@ -201,17 +205,19 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
if (factor_coul<(numtyp)1.0) {
numtyp grij = g_ewald * (r+EPS_EWALD);
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= (r+EPS_EWALD);
force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
// Additionally r2inv needs to be accordingly modified since the later
// scaling of the overall force shall be consistent
r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
force *= r2inv;
} else {
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= r;
@ -224,9 +230,9 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*_erfc;
if (factor_coul<(numtyp)1.0)
e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (vflag>0) {
virial[0] += delx*delx*force;
@ -304,7 +310,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq < cut_coulsq) {
rsq += EPSILON;
rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
numtyp force,prefactor,_erfc;
numtyp r2inv = ucl_recip(rsq);
@ -314,32 +320,34 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
if (factor_coul<(numtyp)1.0) {
numtyp grij = g_ewald * (r+EPS_EWALD);
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= (r+EPS_EWALD);
force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
// Additionally r2inv needs to be accordingly modified since the later
// scaling of the overall force shall be consistent
r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
force *= r2inv;
} else {
numtyp grij = g_ewald * r;
numtyp expm2 = ucl_exp(-grij*grij);
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
numtyp u = (numtyp)1.0 - t;
_erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
prefactor /= r;
force = prefactor * (_erfc + EWALD_F*grij*expm2);
force *= r2inv;
}
force *= r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*_erfc;
if (factor_coul<(numtyp)1.0)
e_coul -= ((numtyp)1.0-factor_coul)*prefactor;
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
e_coul += e;
}
if (vflag>0) {
virial[0] += delx*delx*force;

View File

@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
const __global tagint *restrict in,
int columns_in, int rows_in)
{
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
__syncthreads();
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
nbor_list[pid_i]=pid_i;
} else {
stride=0;
neigh_counts=host_numj+pid_i-inum;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
@ -232,7 +232,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5
cnt++;
if (cnt <= neigh_bin_size) {
*neigh_list = pid_j;
@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
}
}
}
__syncthreads();
} // for (k)
__syncthreads();
} // for (k)
}
}
}