forked from lijiext/lammps
1438 lines
43 KiB
Plaintext
1438 lines
43 KiB
Plaintext
|
/* ----------------------------------------------------------------------
|
||
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||
|
|
||
|
Original Version:
|
||
|
http://lammps.sandia.gov, Sandia National Laboratories
|
||
|
Steve Plimpton, sjplimp@sandia.gov
|
||
|
|
||
|
See the README file in the top-level LAMMPS directory.
|
||
|
|
||
|
-----------------------------------------------------------------------
|
||
|
|
||
|
USER-CUDA Package and associated modifications:
|
||
|
https://sourceforge.net/projects/lammpscuda/
|
||
|
|
||
|
Christian Trott, christian.trott@tu-ilmenau.de
|
||
|
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
||
|
Theoretical Physics II, University of Technology Ilmenau, Germany
|
||
|
|
||
|
See the README file in the USER-CUDA directory.
|
||
|
|
||
|
This software is distributed under the GNU General Public License.
|
||
|
------------------------------------------------------------------------- */
|
||
|
#define EWALD_F 1.12837917
|
||
|
#define EWALD_P 0.3275911
|
||
|
#define A1 0.254829592
|
||
|
#define A2 -0.284496736
|
||
|
#define A3 1.421413741
|
||
|
#define A4 -1.453152027
|
||
|
#define A5 1.061405429
|
||
|
|
||
|
|
||
|
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||
|
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||
|
{
|
||
|
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||
|
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||
|
|
||
|
ENERGY_FLOAT* sharedE;
|
||
|
ENERGY_FLOAT* sharedECoul;
|
||
|
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||
|
|
||
|
if(eflag || eflag_atom) {
|
||
|
sharedE = &sharedmem[threadIdx.x];
|
||
|
sharedE[0] = ENERGY_F(0.0);
|
||
|
sharedV += blockDim.x;
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
sharedECoul = sharedE + blockDim.x;
|
||
|
sharedECoul[0] = ENERGY_F(0.0);
|
||
|
sharedV += blockDim.x;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(vflag || vflag_atom) {
|
||
|
sharedV[0 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[1 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[2 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[3 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[4 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[5 * blockDim.x] = ENERGY_F(0.0);
|
||
|
}
|
||
|
|
||
|
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
X_FLOAT xtmp, ytmp, ztmp;
|
||
|
X_FLOAT4 myxtype;
|
||
|
F_FLOAT fxtmp, fytmp, fztmp, fpair;
|
||
|
F_FLOAT delx, dely, delz;
|
||
|
F_FLOAT factor_lj, factor_coul;
|
||
|
F_FLOAT qtmp;
|
||
|
int itype, i, j;
|
||
|
int jnum = 0;
|
||
|
int* jlist;
|
||
|
|
||
|
if(ii < _inum) {
|
||
|
i = _ilist[ii];
|
||
|
|
||
|
myxtype = fetchXType(i);
|
||
|
xtmp = myxtype.x;
|
||
|
ytmp = myxtype.y;
|
||
|
ztmp = myxtype.z;
|
||
|
itype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
|
||
|
fxtmp = F_F(0.0);
|
||
|
fytmp = F_F(0.0);
|
||
|
fztmp = F_F(0.0);
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
qtmp = fetchQ(i);
|
||
|
|
||
|
jnum = _numneigh[i];
|
||
|
jlist = &_neighbors[i];
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
for(int jj = 0; jj < jnum; jj++) {
|
||
|
if(ii < _inum)
|
||
|
if(jj < jnum) {
|
||
|
fpair = F_F(0.0);
|
||
|
j = jlist[jj * _nlocal];
|
||
|
factor_lj = _special_lj[sbmask(j)];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
factor_coul = _special_coul[sbmask(j)];
|
||
|
|
||
|
j &= NEIGHMASK;
|
||
|
|
||
|
myxtype = fetchXType(j);
|
||
|
delx = xtmp - myxtype.x;
|
||
|
dely = ytmp - myxtype.y;
|
||
|
delz = ztmp - myxtype.z;
|
||
|
int jtype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
|
||
|
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||
|
|
||
|
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
switch(pair_type) {
|
||
|
case PAIR_BORN:
|
||
|
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_BUCK:
|
||
|
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_CG_CMM:
|
||
|
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CHARMM:
|
||
|
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CLASS2:
|
||
|
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CUT:
|
||
|
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_EXPAND:
|
||
|
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_GROMACS:
|
||
|
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_SMOOTH:
|
||
|
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ96_CUT:
|
||
|
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE_R6:
|
||
|
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE:
|
||
|
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||
|
|
||
|
if(qiqj * qiqj > 1e-8) {
|
||
|
const bool in_coul_cutoff =
|
||
|
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_coul_cutoff) {
|
||
|
switch(coul_type) {
|
||
|
case COUL_CHARMM:
|
||
|
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CHARMM_IMPLICIT:
|
||
|
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CUT: {
|
||
|
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * (F_F(1.0) / rsq);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_DEBYE: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const X_FLOAT r = _RSQRT_(r2inv);
|
||
|
const X_FLOAT rinv = F_F(1.0) / r;
|
||
|
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||
|
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul * rinv;
|
||
|
}
|
||
|
|
||
|
forcecoul *= (_kappa + rinv);
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_GROMACS:
|
||
|
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_LONG: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const F_FLOAT r = _RSQRT_(r2inv);
|
||
|
const F_FLOAT grij = _g_ewald * r;
|
||
|
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||
|
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||
|
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||
|
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||
|
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||
|
|
||
|
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += prefactor * erfc;
|
||
|
|
||
|
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
in_cutoff = in_cutoff || in_coul_cutoff;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
F_FLOAT dxfp, dyfp, dzfp;
|
||
|
fxtmp += dxfp = delx * fpair;
|
||
|
fytmp += dyfp = dely * fpair;
|
||
|
fztmp += dzfp = delz * fpair;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedV[0 * blockDim.x] += delx * dxfp;
|
||
|
sharedV[1 * blockDim.x] += dely * dyfp;
|
||
|
sharedV[2 * blockDim.x] += delz * dzfp;
|
||
|
sharedV[3 * blockDim.x] += delx * dyfp;
|
||
|
sharedV[4 * blockDim.x] += delx * dzfp;
|
||
|
sharedV[5 * blockDim.x] += dely * dzfp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
if(ii < _inum) {
|
||
|
F_FLOAT* my_f;
|
||
|
|
||
|
if(_collect_forces_later) {
|
||
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||
|
|
||
|
if(eflag) {
|
||
|
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
if(vflag) {
|
||
|
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
my_f = (F_FLOAT*) buffer;
|
||
|
my_f += i;
|
||
|
*my_f = fxtmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f = fytmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f = fztmp;
|
||
|
} else {
|
||
|
my_f = _f + i;
|
||
|
*my_f += fxtmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f += fytmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f += fztmp;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedE[0] = evdwl;
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedECoul[0] = ecoul;
|
||
|
}
|
||
|
|
||
|
if(eflag_atom && i < _nlocal) {
|
||
|
if(coul_type != COUL_NONE)
|
||
|
_eatom[i] += evdwl + ecoul;
|
||
|
else
|
||
|
_eatom[i] += evdwl;
|
||
|
}
|
||
|
|
||
|
if(vflag_atom && i < _nlocal) {
|
||
|
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
|
||
|
_vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
|
||
|
_vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
|
||
|
_vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
|
||
|
_vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
|
||
|
_vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
|
||
|
}
|
||
|
|
||
|
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
|
||
|
}
|
||
|
|
||
|
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||
|
__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
|
||
|
{
|
||
|
int ii = (blockIdx.x * gridDim.y + blockIdx.y);
|
||
|
|
||
|
if(ii >= _inum)
|
||
|
return;
|
||
|
|
||
|
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||
|
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||
|
F_FLOAT3* sharedVirial1;
|
||
|
F_FLOAT3* sharedVirial2;
|
||
|
F_FLOAT* sharedEnergy;
|
||
|
F_FLOAT* sharedEnergyCoul;
|
||
|
|
||
|
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1 = &sharedForce[64];
|
||
|
sharedVirial2 = &sharedVirial1[64];
|
||
|
} else {
|
||
|
sharedVirial1 = &sharedForce[0];
|
||
|
sharedVirial2 = &sharedVirial1[0];
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
if(vflag || vflag_atom)
|
||
|
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
|
||
|
else
|
||
|
sharedEnergy = (F_FLOAT*) &sharedForce[64];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
|
||
|
|
||
|
}
|
||
|
|
||
|
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
|
||
|
X_FLOAT xtmp, ytmp, ztmp;
|
||
|
X_FLOAT4 myxtype;
|
||
|
F_FLOAT delx, dely, delz;
|
||
|
F_FLOAT factor_lj, factor_coul;
|
||
|
F_FLOAT fpair;
|
||
|
F_FLOAT qtmp;
|
||
|
int itype, jnum, i, j;
|
||
|
int* jlist;
|
||
|
|
||
|
i = _ilist[ii];
|
||
|
|
||
|
myxtype = fetchXType(i);
|
||
|
|
||
|
xtmp = myxtype.x;
|
||
|
ytmp = myxtype.y;
|
||
|
ztmp = myxtype.z;
|
||
|
itype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
qtmp = fetchQ(i);
|
||
|
|
||
|
jnum = _numneigh[i];
|
||
|
|
||
|
jlist = &_neighbors[i * _maxneighbors];
|
||
|
__syncthreads();
|
||
|
|
||
|
for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
|
||
|
if(jj < jnum) {
|
||
|
fpair = F_F(0.0);
|
||
|
j = jlist[jj];
|
||
|
factor_lj = _special_lj[sbmask(j)];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
factor_coul = _special_coul[sbmask(j)];
|
||
|
|
||
|
j &= NEIGHMASK;
|
||
|
|
||
|
myxtype = fetchXType(j);
|
||
|
|
||
|
delx = xtmp - myxtype.x;
|
||
|
dely = ytmp - myxtype.y;
|
||
|
delz = ztmp - myxtype.z;
|
||
|
int jtype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||
|
|
||
|
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||
|
bool in_coul_cutoff;
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
switch(pair_type) {
|
||
|
case PAIR_BORN:
|
||
|
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_BUCK:
|
||
|
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_CG_CMM:
|
||
|
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CHARMM:
|
||
|
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CLASS2:
|
||
|
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CUT:
|
||
|
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_EXPAND:
|
||
|
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_GROMACS:
|
||
|
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_SMOOTH:
|
||
|
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ96_CUT:
|
||
|
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE_R6:
|
||
|
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE:
|
||
|
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||
|
|
||
|
if(qiqj * qiqj > (1e-8f)) {
|
||
|
in_coul_cutoff =
|
||
|
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_coul_cutoff) {
|
||
|
switch(coul_type) {
|
||
|
case COUL_CHARMM:
|
||
|
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CHARMM_IMPLICIT:
|
||
|
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_GROMACS:
|
||
|
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_LONG: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const F_FLOAT r = _RSQRT_(r2inv);
|
||
|
const F_FLOAT grij = _g_ewald * r;
|
||
|
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||
|
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||
|
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||
|
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||
|
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||
|
|
||
|
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += prefactor * erfc;
|
||
|
|
||
|
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_DEBYE: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const X_FLOAT r = _RSQRT_(r2inv);
|
||
|
const X_FLOAT rinv = F_F(1.0) / r;
|
||
|
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||
|
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul * rinv;
|
||
|
}
|
||
|
|
||
|
forcecoul *= (_kappa + rinv);
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_CUT: {
|
||
|
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * (F_F(1.0) / rsq);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
if(in_cutoff || in_coul_cutoff) {
|
||
|
F_FLOAT dxfp, dyfp, dzfp;
|
||
|
partialForce.x += dxfp = delx * fpair;
|
||
|
partialForce.y += dyfp = dely * fpair;
|
||
|
partialForce.z += dzfp = delz * fpair;
|
||
|
|
||
|
if(vflag) {
|
||
|
partialVirial1.x += delx * dxfp;
|
||
|
partialVirial1.y += dely * dyfp;
|
||
|
partialVirial1.z += delz * dzfp;
|
||
|
partialVirial2.x += delx * dyfp;
|
||
|
partialVirial2.y += delx * dzfp;
|
||
|
partialVirial2.z += dely * dzfp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedEnergy[threadIdx.x] = evdwl;
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul[threadIdx.x] = ecoul;
|
||
|
}
|
||
|
|
||
|
sharedForce[threadIdx.x] = partialForce;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1[threadIdx.x] = partialVirial1;
|
||
|
sharedVirial2[threadIdx.x] = partialVirial2;
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
|
||
|
for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
|
||
|
|
||
|
if(threadIdx.x < s) {
|
||
|
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
|
||
|
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
|
||
|
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
|
||
|
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
|
||
|
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
|
||
|
|
||
|
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
|
||
|
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
|
||
|
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
}
|
||
|
|
||
|
if(threadIdx.x == 0) {
|
||
|
|
||
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||
|
|
||
|
if(eflag) {
|
||
|
ENERGY_FLOAT tmp_evdwl;
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
|
||
|
|
||
|
if(eflag_atom)
|
||
|
_eatom[i] = tmp_evdwl;
|
||
|
|
||
|
buffer = &buffer[gridDim.x * gridDim.y];
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
|
||
|
|
||
|
if(eflag_atom)
|
||
|
_eatom[i] += tmp_evdwl;
|
||
|
|
||
|
buffer = &buffer[gridDim.x * gridDim.y];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(vflag) {
|
||
|
ENERGY_FLOAT tmp;
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
|
||
|
|
||
|
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
F_FLOAT* my_f;
|
||
|
|
||
|
if(_collect_forces_later) {
|
||
|
my_f = (F_FLOAT*) buffer;
|
||
|
my_f += i;
|
||
|
*my_f = sharedForce[0].x;
|
||
|
my_f += _nmax;
|
||
|
*my_f = sharedForce[0].y;
|
||
|
my_f += _nmax;
|
||
|
*my_f = sharedForce[0].z;
|
||
|
} else {
|
||
|
my_f = _f + i;
|
||
|
*my_f += sharedForce[0].x;
|
||
|
my_f += _nmax;
|
||
|
*my_f += sharedForce[0].y;
|
||
|
my_f += _nmax;
|
||
|
*my_f += sharedForce[0].z;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||
|
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
|
||
|
{
|
||
|
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||
|
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||
|
|
||
|
ENERGY_FLOAT* sharedE;
|
||
|
ENERGY_FLOAT* sharedECoul;
|
||
|
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
||
|
|
||
|
if(eflag || eflag_atom) {
|
||
|
sharedE = &sharedmem[threadIdx.x];
|
||
|
sharedE[0] = ENERGY_F(0.0);
|
||
|
sharedV += blockDim.x;
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
sharedECoul = sharedE + blockDim.x;
|
||
|
sharedECoul[0] = ENERGY_F(0.0);
|
||
|
sharedV += blockDim.x;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(vflag || vflag_atom) {
|
||
|
sharedV[0 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[1 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[2 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[3 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[4 * blockDim.x] = ENERGY_F(0.0);
|
||
|
sharedV[5 * blockDim.x] = ENERGY_F(0.0);
|
||
|
}
|
||
|
|
||
|
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
X_FLOAT xtmp, ytmp, ztmp;
|
||
|
X_FLOAT4 myxtype;
|
||
|
F_FLOAT fxtmp, fytmp, fztmp, fpair;
|
||
|
F_FLOAT delx, dely, delz;
|
||
|
F_FLOAT factor_lj, factor_coul;
|
||
|
F_FLOAT qtmp;
|
||
|
int itype, i, j;
|
||
|
int jnum = 0;
|
||
|
int* jlist;
|
||
|
|
||
|
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
|
||
|
i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;
|
||
|
|
||
|
myxtype = fetchXType(i);
|
||
|
myxtype = _x_type[i];
|
||
|
xtmp = myxtype.x;
|
||
|
ytmp = myxtype.y;
|
||
|
ztmp = myxtype.z;
|
||
|
itype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
|
||
|
fxtmp = F_F(0.0);
|
||
|
fytmp = F_F(0.0);
|
||
|
fztmp = F_F(0.0);
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
qtmp = fetchQ(i);
|
||
|
|
||
|
jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
|
||
|
|
||
|
|
||
|
jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
for(int jj = 0; jj < jnum; jj++) {
|
||
|
if(ii < (comm_phase < 2 ? _inum : _inum_border[0]))
|
||
|
if(jj < jnum) {
|
||
|
fpair = F_F(0.0);
|
||
|
j = jlist[jj * _nlocal];
|
||
|
|
||
|
factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
|
||
|
|
||
|
j = j < _nall ? j : j % _nall;
|
||
|
|
||
|
myxtype = fetchXType(j);
|
||
|
delx = xtmp - myxtype.x;
|
||
|
dely = ytmp - myxtype.y;
|
||
|
delz = ztmp - myxtype.z;
|
||
|
int jtype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
|
||
|
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||
|
|
||
|
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
switch(pair_type) {
|
||
|
case PAIR_BORN:
|
||
|
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_BUCK:
|
||
|
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_CG_CMM:
|
||
|
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CHARMM:
|
||
|
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CLASS2:
|
||
|
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CUT:
|
||
|
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_EXPAND:
|
||
|
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_GROMACS:
|
||
|
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_SMOOTH:
|
||
|
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ96_CUT:
|
||
|
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE_R6:
|
||
|
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE:
|
||
|
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||
|
|
||
|
if(qiqj * qiqj > 1e-8) {
|
||
|
const bool in_coul_cutoff =
|
||
|
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_coul_cutoff) {
|
||
|
switch(coul_type) {
|
||
|
case COUL_CHARMM:
|
||
|
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CHARMM_IMPLICIT:
|
||
|
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CUT: {
|
||
|
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * (F_F(1.0) / rsq);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_DEBYE: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const X_FLOAT r = _RSQRT_(r2inv);
|
||
|
const X_FLOAT rinv = F_F(1.0) / r;
|
||
|
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||
|
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul * rinv;
|
||
|
}
|
||
|
|
||
|
forcecoul *= (_kappa + rinv);
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_GROMACS:
|
||
|
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_LONG: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const F_FLOAT r = _RSQRT_(r2inv);
|
||
|
const F_FLOAT grij = _g_ewald * r;
|
||
|
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||
|
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||
|
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||
|
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||
|
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||
|
|
||
|
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += prefactor * erfc;
|
||
|
|
||
|
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
in_cutoff = in_cutoff || in_coul_cutoff;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
F_FLOAT dxfp, dyfp, dzfp;
|
||
|
fxtmp += dxfp = delx * fpair;
|
||
|
fytmp += dyfp = dely * fpair;
|
||
|
fztmp += dzfp = delz * fpair;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedV[0 * blockDim.x] += delx * dxfp;
|
||
|
sharedV[1 * blockDim.x] += dely * dyfp;
|
||
|
sharedV[2 * blockDim.x] += delz * dzfp;
|
||
|
sharedV[3 * blockDim.x] += delx * dyfp;
|
||
|
sharedV[4 * blockDim.x] += delx * dzfp;
|
||
|
sharedV[5 * blockDim.x] += dely * dzfp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
|
||
|
F_FLOAT* my_f;
|
||
|
|
||
|
if(_collect_forces_later) {
|
||
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||
|
|
||
|
if(eflag) {
|
||
|
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
buffer = &buffer[1 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
if(vflag) {
|
||
|
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
my_f = (F_FLOAT*) buffer;
|
||
|
my_f += i;
|
||
|
*my_f = fxtmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f = fytmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f = fztmp;
|
||
|
} else {
|
||
|
my_f = _f + i;
|
||
|
*my_f += fxtmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f += fytmp;
|
||
|
my_f += _nmax;
|
||
|
*my_f += fztmp;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedE[0] = evdwl;
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedECoul[0] = ecoul;
|
||
|
}
|
||
|
|
||
|
if(eflag_atom && i < _nlocal) {
|
||
|
if(coul_type != COUL_NONE)
|
||
|
_eatom[i] += evdwl + ecoul;
|
||
|
else
|
||
|
_eatom[i] += evdwl;
|
||
|
}
|
||
|
|
||
|
if(vflag_atom && i < _nlocal) {
|
||
|
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
|
||
|
_vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
|
||
|
_vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
|
||
|
_vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
|
||
|
_vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
|
||
|
_vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
|
||
|
}
|
||
|
|
||
|
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
|
||
|
}
|
||
|
|
||
|
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
|
||
|
__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
|
||
|
{
|
||
|
int ii = (blockIdx.x * gridDim.y + blockIdx.y);
|
||
|
|
||
|
if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
|
||
|
return;
|
||
|
|
||
|
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
||
|
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
|
||
|
F_FLOAT3* sharedVirial1;
|
||
|
F_FLOAT3* sharedVirial2;
|
||
|
F_FLOAT* sharedEnergy;
|
||
|
F_FLOAT* sharedEnergyCoul;
|
||
|
|
||
|
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1 = &sharedForce[64];
|
||
|
sharedVirial2 = &sharedVirial1[64];
|
||
|
} else {
|
||
|
sharedVirial1 = &sharedForce[0];
|
||
|
sharedVirial2 = &sharedVirial1[0];
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
if(vflag || vflag_atom)
|
||
|
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
|
||
|
else
|
||
|
sharedEnergy = (F_FLOAT*) &sharedForce[64];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
|
||
|
|
||
|
}
|
||
|
|
||
|
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
|
||
|
|
||
|
X_FLOAT xtmp, ytmp, ztmp;
|
||
|
X_FLOAT4 myxtype;
|
||
|
F_FLOAT delx, dely, delz;
|
||
|
F_FLOAT factor_lj, factor_coul;
|
||
|
F_FLOAT fpair;
|
||
|
F_FLOAT qtmp;
|
||
|
int itype, jnum, i, j;
|
||
|
int* jlist;
|
||
|
|
||
|
i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];
|
||
|
|
||
|
myxtype = fetchXType(i);
|
||
|
|
||
|
xtmp = myxtype.x;
|
||
|
ytmp = myxtype.y;
|
||
|
ztmp = myxtype.z;
|
||
|
itype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
qtmp = fetchQ(i);
|
||
|
|
||
|
jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
|
||
|
|
||
|
jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
|
||
|
__syncthreads();
|
||
|
|
||
|
for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
|
||
|
if(jj < jnum) {
|
||
|
fpair = F_F(0.0);
|
||
|
j = jlist[jj];
|
||
|
factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
|
||
|
|
||
|
j = j < _nall ? j : j % _nall;
|
||
|
|
||
|
myxtype = fetchXType(j);
|
||
|
|
||
|
delx = xtmp - myxtype.x;
|
||
|
dely = ytmp - myxtype.y;
|
||
|
delz = ztmp - myxtype.z;
|
||
|
int jtype = static_cast <int>(myxtype.w);
|
||
|
|
||
|
const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||
|
|
||
|
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
|
||
|
bool in_coul_cutoff;
|
||
|
|
||
|
if(in_cutoff) {
|
||
|
switch(pair_type) {
|
||
|
case PAIR_BORN:
|
||
|
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_BUCK:
|
||
|
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_CG_CMM:
|
||
|
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CHARMM:
|
||
|
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CLASS2:
|
||
|
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_CUT:
|
||
|
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_EXPAND:
|
||
|
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_GROMACS:
|
||
|
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ_SMOOTH:
|
||
|
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_LJ96_CUT:
|
||
|
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE_R6:
|
||
|
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
|
||
|
case PAIR_MORSE:
|
||
|
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
const F_FLOAT qiqj = qtmp * fetchQ(j);
|
||
|
|
||
|
if(qiqj * qiqj > (1e-8f)) {
|
||
|
in_coul_cutoff =
|
||
|
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
|
||
|
|
||
|
if(in_coul_cutoff) {
|
||
|
switch(coul_type) {
|
||
|
case COUL_CHARMM:
|
||
|
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_CHARMM_IMPLICIT:
|
||
|
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_GROMACS:
|
||
|
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
|
||
|
break;
|
||
|
|
||
|
case COUL_LONG: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const F_FLOAT r = _RSQRT_(r2inv);
|
||
|
const F_FLOAT grij = _g_ewald * r;
|
||
|
const F_FLOAT expm2 = _EXP_(-grij * grij);
|
||
|
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
|
||
|
const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
|
||
|
const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
|
||
|
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
|
||
|
|
||
|
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += prefactor * erfc;
|
||
|
|
||
|
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_DEBYE: {
|
||
|
const F_FLOAT r2inv = F_F(1.0) / rsq;
|
||
|
const X_FLOAT r = _RSQRT_(r2inv);
|
||
|
const X_FLOAT rinv = F_F(1.0) / r;
|
||
|
const F_FLOAT screening = _EXP_(-_kappa * r);
|
||
|
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul * rinv;
|
||
|
}
|
||
|
|
||
|
forcecoul *= (_kappa + rinv);
|
||
|
fpair += forcecoul * r2inv;
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case COUL_CUT: {
|
||
|
const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
|
||
|
|
||
|
if(eflag) {
|
||
|
ecoul += forcecoul;
|
||
|
}
|
||
|
|
||
|
fpair += forcecoul * (F_F(1.0) / rsq);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
if(in_cutoff || in_coul_cutoff) {
|
||
|
F_FLOAT dxfp, dyfp, dzfp;
|
||
|
partialForce.x += dxfp = delx * fpair;
|
||
|
partialForce.y += dyfp = dely * fpair;
|
||
|
partialForce.z += dzfp = delz * fpair;
|
||
|
|
||
|
if(vflag) {
|
||
|
partialVirial1.x += delx * dxfp;
|
||
|
partialVirial1.y += dely * dyfp;
|
||
|
partialVirial1.z += delz * dzfp;
|
||
|
partialVirial2.x += delx * dyfp;
|
||
|
partialVirial2.y += delx * dzfp;
|
||
|
partialVirial2.z += dely * dzfp;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedEnergy[threadIdx.x] = evdwl;
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul[threadIdx.x] = ecoul;
|
||
|
}
|
||
|
|
||
|
sharedForce[threadIdx.x] = partialForce;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1[threadIdx.x] = partialVirial1;
|
||
|
sharedVirial2[threadIdx.x] = partialVirial2;
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
|
||
|
|
||
|
for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
|
||
|
|
||
|
if(threadIdx.x < s) {
|
||
|
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
|
||
|
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
|
||
|
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
|
||
|
|
||
|
if(vflag) {
|
||
|
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
|
||
|
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
|
||
|
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
|
||
|
|
||
|
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
|
||
|
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
|
||
|
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
|
||
|
}
|
||
|
|
||
|
if(eflag) {
|
||
|
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
|
||
|
|
||
|
if(coul_type != COUL_NONE)
|
||
|
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__syncthreads();
|
||
|
}
|
||
|
|
||
|
if(threadIdx.x == 0) {
|
||
|
|
||
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
||
|
|
||
|
if(eflag) {
|
||
|
ENERGY_FLOAT tmp_evdwl;
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
|
||
|
|
||
|
if(eflag_atom)
|
||
|
_eatom[i] = tmp_evdwl;
|
||
|
|
||
|
buffer = &buffer[gridDim.x * gridDim.y];
|
||
|
|
||
|
if(coul_type != COUL_NONE) {
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
|
||
|
|
||
|
if(eflag_atom)
|
||
|
_eatom[i] += tmp_evdwl;
|
||
|
|
||
|
buffer = &buffer[gridDim.x * gridDim.y];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(vflag) {
|
||
|
ENERGY_FLOAT tmp;
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
|
||
|
|
||
|
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
|
||
|
|
||
|
if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
|
||
|
|
||
|
buffer = &buffer[6 * gridDim.x * gridDim.y];
|
||
|
}
|
||
|
|
||
|
F_FLOAT* my_f;
|
||
|
|
||
|
if(_collect_forces_later) {
|
||
|
my_f = (F_FLOAT*) buffer;
|
||
|
my_f += i;
|
||
|
*my_f = sharedForce[0].x;
|
||
|
my_f += _nmax;
|
||
|
*my_f = sharedForce[0].y;
|
||
|
my_f += _nmax;
|
||
|
*my_f = sharedForce[0].z;
|
||
|
} else {
|
||
|
my_f = _f + i;
|
||
|
*my_f += sharedForce[0].x;
|
||
|
my_f += _nmax;
|
||
|
*my_f += sharedForce[0].y;
|
||
|
my_f += _nmax;
|
||
|
*my_f += sharedForce[0].z;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_GenerateXType_Kernel()
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i < _nall) {
|
||
|
X_FLOAT4 xtype;
|
||
|
xtype.x = _x[i];
|
||
|
xtype.y = _x[i + _nmax];
|
||
|
xtype.z = _x[i + 2 * _nmax];
|
||
|
xtype.w = _type[i];
|
||
|
_x_type[i] = xtype;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_GenerateVRadius_Kernel()
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i < _nall) {
|
||
|
V_FLOAT4 vradius;
|
||
|
vradius.x = _v[i];
|
||
|
vradius.y = _v[i + _nmax];
|
||
|
vradius.z = _v[i + 2 * _nmax];
|
||
|
vradius.w = _radius[i];
|
||
|
_v_radius[i] = vradius;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_GenerateOmegaRmass_Kernel()
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i < _nall) {
|
||
|
V_FLOAT4 omegarmass;
|
||
|
omegarmass.x = _omega[i];
|
||
|
omegarmass.y = _omega[i + _nmax];
|
||
|
omegarmass.z = _omega[i + 2 * _nmax];
|
||
|
omegarmass.w = _rmass[i];
|
||
|
_omega_rmass[i] = omegarmass;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_RevertXType_Kernel()
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i < _nall) {
|
||
|
X_FLOAT4 xtype = _x_type[i];
|
||
|
_x[i] = xtype.x;
|
||
|
_x[i + _nmax] = xtype.y;
|
||
|
_x[i + 2 * _nmax] = xtype.z;
|
||
|
_type[i] = static_cast <int>(xtype.w);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_BuildXHold_Kernel()
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i < _nall) {
|
||
|
X_FLOAT4 xtype = _x_type[i];
|
||
|
_xhold[i] = xtype.x;
|
||
|
_xhold[i + _nmax] = xtype.y;
|
||
|
_xhold[i + 2 * _nmax] = xtype.z;
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
__global__ void Pair_CollectForces_Kernel(int nperblock, int n)
|
||
|
{
|
||
|
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
|
||
|
|
||
|
if(i >= _nlocal) return;
|
||
|
|
||
|
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
|
||
|
|
||
|
F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
|
||
|
F_FLOAT* my_f = _f + i;
|
||
|
buf_f += i;
|
||
|
*my_f += * buf_f;
|
||
|
my_f += _nmax;
|
||
|
buf_f += _nmax;
|
||
|
*my_f += * buf_f;
|
||
|
my_f += _nmax;
|
||
|
buf_f += _nmax;
|
||
|
*my_f += * buf_f;
|
||
|
my_f += _nmax;
|
||
|
}
|