lammps/lib/cuda/cuda_pair_kernel.cu

1438 lines
43 KiB
Plaintext
Raw Normal View History

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedECoul;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
sharedE = &sharedmem[threadIdx.x];
sharedE[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
if(coul_type != COUL_NONE) {
sharedECoul = sharedE + blockDim.x;
sharedECoul[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
}
}
if(vflag || vflag_atom) {
sharedV[0 * blockDim.x] = ENERGY_F(0.0);
sharedV[1 * blockDim.x] = ENERGY_F(0.0);
sharedV[2 * blockDim.x] = ENERGY_F(0.0);
sharedV[3 * blockDim.x] = ENERGY_F(0.0);
sharedV[4 * blockDim.x] = ENERGY_F(0.0);
sharedV[5 * blockDim.x] = ENERGY_F(0.0);
}
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT qtmp;
int itype, i, j;
int jnum = 0;
int* jlist;
if(ii < _inum) {
i = _ilist[ii];
myxtype = fetchXType(i);
xtmp = myxtype.x;
ytmp = myxtype.y;
ztmp = myxtype.z;
itype = static_cast <int>(myxtype.w);
fxtmp = F_F(0.0);
fytmp = F_F(0.0);
fztmp = F_F(0.0);
if(coul_type != COUL_NONE)
qtmp = fetchQ(i);
jnum = _numneigh[i];
jlist = &_neighbors[i];
}
__syncthreads();
for(int jj = 0; jj < jnum; jj++) {
if(ii < _inum)
if(jj < jnum) {
fpair = F_F(0.0);
j = jlist[jj * _nlocal];
factor_lj = _special_lj[sbmask(j)];
if(coul_type != COUL_NONE)
factor_coul = _special_coul[sbmask(j)];
j &= NEIGHMASK;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
if(in_cutoff) {
switch(pair_type) {
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_CG_CMM:
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
}
}
if(coul_type != COUL_NONE) {
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > 1e-8) {
const bool in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if(in_coul_cutoff) {
switch(coul_type) {
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CUT: {
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
}
fpair += forcecoul * (F_F(1.0) / rsq);
}
break;
case COUL_DEBYE: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul * r2inv;
}
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_LONG: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
if(eflag) {
ecoul += prefactor * erfc;
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
}
fpair += forcecoul * r2inv;
}
break;
}
}
in_cutoff = in_cutoff || in_coul_cutoff;
}
}
if(in_cutoff) {
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * fpair;
fytmp += dyfp = dely * fpair;
fztmp += dzfp = delz * fpair;
if(vflag) {
sharedV[0 * blockDim.x] += delx * dxfp;
sharedV[1 * blockDim.x] += dely * dyfp;
sharedV[2 * blockDim.x] += delz * dzfp;
sharedV[3 * blockDim.x] += delx * dyfp;
sharedV[4 * blockDim.x] += delx * dzfp;
sharedV[5 * blockDim.x] += dely * dzfp;
}
}
}
}
__syncthreads();
if(ii < _inum) {
F_CFLOAT* my_f;
if(_collect_forces_later) {
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer = &buffer[1 * gridDim.x * gridDim.y];
if(coul_type != COUL_NONE)
buffer = &buffer[1 * gridDim.x * gridDim.y];
}
if(vflag) {
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = fxtmp;
my_f += _nmax;
*my_f = fytmp;
my_f += _nmax;
*my_f = fztmp;
} else {
my_f = _f + i;
*my_f += fxtmp;
my_f += _nmax;
*my_f += fytmp;
my_f += _nmax;
*my_f += fztmp;
}
}
__syncthreads();
if(eflag) {
sharedE[0] = evdwl;
if(coul_type != COUL_NONE)
sharedECoul[0] = ecoul;
}
if(eflag_atom && i < _nlocal) {
if(coul_type != COUL_NONE)
_eatom[i] += evdwl + ecoul;
else
_eatom[i] += evdwl;
}
if(vflag_atom && i < _nlocal) {
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
_vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
_vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
_vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
_vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
_vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
}
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
int ii = (blockIdx.x * gridDim.y + blockIdx.y);
if(ii >= _inum)
return;
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
F_CFLOAT3* sharedVirial1;
F_CFLOAT3* sharedVirial2;
F_CFLOAT* sharedEnergy;
F_CFLOAT* sharedEnergyCoul;
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
if(vflag) {
sharedVirial1 = &sharedForce[64];
sharedVirial2 = &sharedVirial1[64];
} else {
sharedVirial1 = &sharedForce[0];
sharedVirial2 = &sharedVirial1[0];
}
if(eflag) {
if(vflag || vflag_atom)
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
if(coul_type != COUL_NONE)
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
}
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT fpair;
F_CFLOAT qtmp;
int itype, jnum, i, j;
int* jlist;
i = _ilist[ii];
myxtype = fetchXType(i);
xtmp = myxtype.x;
ytmp = myxtype.y;
ztmp = myxtype.z;
itype = static_cast <int>(myxtype.w);
if(coul_type != COUL_NONE)
qtmp = fetchQ(i);
jnum = _numneigh[i];
jlist = &_neighbors[i * _maxneighbors];
__syncthreads();
for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
if(jj < jnum) {
fpair = F_F(0.0);
j = jlist[jj];
factor_lj = _special_lj[sbmask(j)];
if(coul_type != COUL_NONE)
factor_coul = _special_coul[sbmask(j)];
j &= NEIGHMASK;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
if(in_cutoff) {
switch(pair_type) {
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_CG_CMM:
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
}
}
if(coul_type != COUL_NONE) {
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > (1e-8f)) {
in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if(in_coul_cutoff) {
switch(coul_type) {
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_LONG: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
if(eflag) {
ecoul += prefactor * erfc;
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
}
fpair += forcecoul * r2inv;
}
break;
case COUL_DEBYE: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul * r2inv;
}
break;
case COUL_CUT: {
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
}
fpair += forcecoul * (F_F(1.0) / rsq);
}
break;
}
}
}
}
if(in_cutoff || in_coul_cutoff) {
F_CFLOAT dxfp, dyfp, dzfp;
partialForce.x += dxfp = delx * fpair;
partialForce.y += dyfp = dely * fpair;
partialForce.z += dzfp = delz * fpair;
if(vflag) {
partialVirial1.x += delx * dxfp;
partialVirial1.y += dely * dyfp;
partialVirial1.z += delz * dzfp;
partialVirial2.x += delx * dyfp;
partialVirial2.y += delx * dzfp;
partialVirial2.z += dely * dzfp;
}
}
}
}
if(eflag) {
sharedEnergy[threadIdx.x] = evdwl;
if(coul_type != COUL_NONE)
sharedEnergyCoul[threadIdx.x] = ecoul;
}
sharedForce[threadIdx.x] = partialForce;
if(vflag) {
sharedVirial1[threadIdx.x] = partialVirial1;
sharedVirial2[threadIdx.x] = partialVirial2;
}
__syncthreads();
for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if(threadIdx.x < s) {
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
if(vflag) {
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
}
if(eflag) {
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
if(coul_type != COUL_NONE)
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
}
}
__syncthreads();
}
if(threadIdx.x == 0) {
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
ENERGY_CFLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
_eatom[i] = tmp_evdwl;
buffer = &buffer[gridDim.x * gridDim.y];
if(coul_type != COUL_NONE) {
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
if(eflag_atom)
_eatom[i] += tmp_evdwl;
buffer = &buffer[gridDim.x * gridDim.y];
}
}
if(vflag) {
ENERGY_CFLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
F_CFLOAT* my_f;
if(_collect_forces_later) {
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x;
my_f += _nmax;
*my_f = sharedForce[0].y;
my_f += _nmax;
*my_f = sharedForce[0].z;
} else {
my_f = _f + i;
*my_f += sharedForce[0].x;
my_f += _nmax;
*my_f += sharedForce[0].y;
my_f += _nmax;
*my_f += sharedForce[0].z;
}
}
}
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
ENERGY_CFLOAT* sharedE;
ENERGY_CFLOAT* sharedECoul;
ENERGY_CFLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag || eflag_atom) {
sharedE = &sharedmem[threadIdx.x];
sharedE[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
if(coul_type != COUL_NONE) {
sharedECoul = sharedE + blockDim.x;
sharedECoul[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
}
}
if(vflag || vflag_atom) {
sharedV[0 * blockDim.x] = ENERGY_F(0.0);
sharedV[1 * blockDim.x] = ENERGY_F(0.0);
sharedV[2 * blockDim.x] = ENERGY_F(0.0);
sharedV[3 * blockDim.x] = ENERGY_F(0.0);
sharedV[4 * blockDim.x] = ENERGY_F(0.0);
sharedV[5 * blockDim.x] = ENERGY_F(0.0);
}
int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT fxtmp, fytmp, fztmp, fpair;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT qtmp;
int itype, i, j;
int jnum = 0;
int* jlist;
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;
myxtype = fetchXType(i);
myxtype = _x_type[i];
xtmp = myxtype.x;
ytmp = myxtype.y;
ztmp = myxtype.z;
itype = static_cast <int>(myxtype.w);
fxtmp = F_F(0.0);
fytmp = F_F(0.0);
fztmp = F_F(0.0);
if(coul_type != COUL_NONE)
qtmp = fetchQ(i);
jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
}
__syncthreads();
for(int jj = 0; jj < jnum; jj++) {
if(ii < (comm_phase < 2 ? _inum : _inum_border[0]))
if(jj < jnum) {
fpair = F_F(0.0);
j = jlist[jj * _nlocal];
factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
if(coul_type != COUL_NONE)
factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
j = j < _nall ? j : j % _nall;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
if(in_cutoff) {
switch(pair_type) {
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_CG_CMM:
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
}
}
if(coul_type != COUL_NONE) {
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > 1e-8) {
const bool in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if(in_coul_cutoff) {
switch(coul_type) {
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CUT: {
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
}
fpair += forcecoul * (F_F(1.0) / rsq);
}
break;
case COUL_DEBYE: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul * r2inv;
}
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_LONG: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
if(eflag) {
ecoul += prefactor * erfc;
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
}
fpair += forcecoul * r2inv;
}
break;
}
}
in_cutoff = in_cutoff || in_coul_cutoff;
}
}
if(in_cutoff) {
F_CFLOAT dxfp, dyfp, dzfp;
fxtmp += dxfp = delx * fpair;
fytmp += dyfp = dely * fpair;
fztmp += dzfp = delz * fpair;
if(vflag) {
sharedV[0 * blockDim.x] += delx * dxfp;
sharedV[1 * blockDim.x] += dely * dyfp;
sharedV[2 * blockDim.x] += delz * dzfp;
sharedV[3 * blockDim.x] += delx * dyfp;
sharedV[4 * blockDim.x] += delx * dzfp;
sharedV[5 * blockDim.x] += dely * dzfp;
}
}
}
}
__syncthreads();
if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
F_CFLOAT* my_f;
if(_collect_forces_later) {
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer = &buffer[1 * gridDim.x * gridDim.y];
if(coul_type != COUL_NONE)
buffer = &buffer[1 * gridDim.x * gridDim.y];
}
if(vflag) {
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = fxtmp;
my_f += _nmax;
*my_f = fytmp;
my_f += _nmax;
*my_f = fztmp;
} else {
my_f = _f + i;
*my_f += fxtmp;
my_f += _nmax;
*my_f += fytmp;
my_f += _nmax;
*my_f += fztmp;
}
}
__syncthreads();
if(eflag) {
sharedE[0] = evdwl;
if(coul_type != COUL_NONE)
sharedECoul[0] = ecoul;
}
if(eflag_atom && i < _nlocal) {
if(coul_type != COUL_NONE)
_eatom[i] += evdwl + ecoul;
else
_eatom[i] += evdwl;
}
if(vflag_atom && i < _nlocal) {
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
_vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
_vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
_vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
_vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
_vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
}
if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
int ii = (blockIdx.x * gridDim.y + blockIdx.y);
if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
return;
ENERGY_CFLOAT evdwl = ENERGY_F(0.0);
ENERGY_CFLOAT ecoul = ENERGY_F(0.0);
F_CFLOAT3* sharedVirial1;
F_CFLOAT3* sharedVirial2;
F_CFLOAT* sharedEnergy;
F_CFLOAT* sharedEnergyCoul;
F_CFLOAT3* sharedForce = (F_CFLOAT3*) &sharedmem[0];
if(vflag) {
sharedVirial1 = &sharedForce[64];
sharedVirial2 = &sharedVirial1[64];
} else {
sharedVirial1 = &sharedForce[0];
sharedVirial2 = &sharedVirial1[0];
}
if(eflag) {
if(vflag || vflag_atom)
sharedEnergy = (F_CFLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_CFLOAT*) &sharedForce[64];
if(coul_type != COUL_NONE)
sharedEnergyCoul = (F_CFLOAT*) &sharedEnergy[64];
}
F_CFLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_CFLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_CFLOAT xtmp, ytmp, ztmp;
X_CFLOAT4 myxtype;
F_CFLOAT delx, dely, delz;
F_CFLOAT factor_lj, factor_coul;
F_CFLOAT fpair;
F_CFLOAT qtmp;
int itype, jnum, i, j;
int* jlist;
i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];
myxtype = fetchXType(i);
xtmp = myxtype.x;
ytmp = myxtype.y;
ztmp = myxtype.z;
itype = static_cast <int>(myxtype.w);
if(coul_type != COUL_NONE)
qtmp = fetchQ(i);
jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
__syncthreads();
for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
if(jj < jnum) {
fpair = F_F(0.0);
j = jlist[jj];
factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
if(coul_type != COUL_NONE)
factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
j = j < _nall ? j : j % _nall;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int>(myxtype.w);
const F_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
if(in_cutoff) {
switch(pair_type) {
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_CG_CMM:
fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
break;
}
}
if(coul_type != COUL_NONE) {
const F_CFLOAT qiqj = qtmp * fetchQ(j);
if(qiqj * qiqj > (1e-8f)) {
in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if(in_coul_cutoff) {
switch(coul_type) {
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
break;
case COUL_LONG: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const F_CFLOAT r = _RSQRT_(r2inv);
const F_CFLOAT grij = _g_ewald * r;
const F_CFLOAT expm2 = _EXP_(-grij * grij);
const F_CFLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
const F_CFLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
const F_CFLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
F_CFLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
if(eflag) {
ecoul += prefactor * erfc;
if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
}
fpair += forcecoul * r2inv;
}
break;
case COUL_DEBYE: {
const F_CFLOAT r2inv = F_F(1.0) / rsq;
const X_CFLOAT r = _RSQRT_(r2inv);
const X_CFLOAT rinv = F_F(1.0) / r;
const F_CFLOAT screening = _EXP_(-_kappa * r);
F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag) {
ecoul += forcecoul * rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul * r2inv;
}
break;
case COUL_CUT: {
const F_CFLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
if(eflag) {
ecoul += forcecoul;
}
fpair += forcecoul * (F_F(1.0) / rsq);
}
break;
}
}
}
}
if(in_cutoff || in_coul_cutoff) {
F_CFLOAT dxfp, dyfp, dzfp;
partialForce.x += dxfp = delx * fpair;
partialForce.y += dyfp = dely * fpair;
partialForce.z += dzfp = delz * fpair;
if(vflag) {
partialVirial1.x += delx * dxfp;
partialVirial1.y += dely * dyfp;
partialVirial1.z += delz * dzfp;
partialVirial2.x += delx * dyfp;
partialVirial2.y += delx * dzfp;
partialVirial2.z += dely * dzfp;
}
}
}
}
if(eflag) {
sharedEnergy[threadIdx.x] = evdwl;
if(coul_type != COUL_NONE)
sharedEnergyCoul[threadIdx.x] = ecoul;
}
sharedForce[threadIdx.x] = partialForce;
if(vflag) {
sharedVirial1[threadIdx.x] = partialVirial1;
sharedVirial2[threadIdx.x] = partialVirial2;
}
__syncthreads();
for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
if(threadIdx.x < s) {
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
if(vflag) {
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
}
if(eflag) {
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
if(coul_type != COUL_NONE)
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
}
}
__syncthreads();
}
if(threadIdx.x == 0) {
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
ENERGY_CFLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
_eatom[i] = tmp_evdwl;
buffer = &buffer[gridDim.x * gridDim.y];
if(coul_type != COUL_NONE) {
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
if(eflag_atom)
_eatom[i] += tmp_evdwl;
buffer = &buffer[gridDim.x * gridDim.y];
}
}
if(vflag) {
ENERGY_CFLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
buffer = &buffer[6 * gridDim.x * gridDim.y];
}
F_CFLOAT* my_f;
if(_collect_forces_later) {
my_f = (F_CFLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x;
my_f += _nmax;
*my_f = sharedForce[0].y;
my_f += _nmax;
*my_f = sharedForce[0].z;
} else {
my_f = _f + i;
*my_f += sharedForce[0].x;
my_f += _nmax;
*my_f += sharedForce[0].y;
my_f += _nmax;
*my_f += sharedForce[0].z;
}
}
}
__global__ void Pair_GenerateXType_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_CFLOAT4 xtype;
xtype.x = _x[i];
xtype.y = _x[i + _nmax];
xtype.z = _x[i + 2 * _nmax];
xtype.w = _type[i];
_x_type[i] = xtype;
}
}
__global__ void Pair_GenerateVRadius_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
V_CFLOAT4 vradius;
vradius.x = _v[i];
vradius.y = _v[i + _nmax];
vradius.z = _v[i + 2 * _nmax];
vradius.w = _radius[i];
_v_radius[i] = vradius;
}
}
__global__ void Pair_GenerateOmegaRmass_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
V_CFLOAT4 omegarmass;
omegarmass.x = _omega[i];
omegarmass.y = _omega[i + _nmax];
omegarmass.z = _omega[i + 2 * _nmax];
omegarmass.w = _rmass[i];
_omega_rmass[i] = omegarmass;
}
}
__global__ void Pair_RevertXType_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_CFLOAT4 xtype = _x_type[i];
_x[i] = xtype.x;
_x[i + _nmax] = xtype.y;
_x[i + 2 * _nmax] = xtype.z;
_type[i] = static_cast <int>(xtype.w);
}
}
__global__ void Pair_BuildXHold_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nall) {
X_CFLOAT4 xtype = _x_type[i];
_xhold[i] = xtype.x;
_xhold[i + _nmax] = xtype.y;
_xhold[i + 2 * _nmax] = xtype.z;
}
}
__global__ void Pair_CollectForces_Kernel(int nperblock, int n)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i >= _nlocal) return;
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
F_CFLOAT* buf_f = (F_CFLOAT*) &buf[nperblock * n];
F_CFLOAT* my_f = _f + i;
buf_f += i;
*my_f += * buf_f;
my_f += _nmax;
buf_f += _nmax;
*my_f += * buf_f;
my_f += _nmax;
buf_f += _nmax;
*my_f += * buf_f;
my_f += _nmax;
}