lammps/lib/cuda/cuda_pair_kernel.cu

1353 lines
40 KiB
Plaintext

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
inline __device__ int sbmask(int j) {
return j >> SBBITS & 3;
}
template <const PAIR_FORCES pair_type,const COUL_FORCES coul_type,const unsigned int extended_data>
__global__ void Pair_Kernel_TpA(int eflag, int vflag,int eflag_atom,int vflag_atom)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedECoul;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag||eflag_atom)
{
sharedE = &sharedmem[threadIdx.x];
sharedE[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
if(coul_type!=COUL_NONE)
{
sharedECoul = sharedE + blockDim.x;
sharedECoul[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
}
}
if(vflag||vflag_atom)
{
sharedV[0*blockDim.x] = ENERGY_F(0.0);
sharedV[1*blockDim.x] = ENERGY_F(0.0);
sharedV[2*blockDim.x] = ENERGY_F(0.0);
sharedV[3*blockDim.x] = ENERGY_F(0.0);
sharedV[4*blockDim.x] = ENERGY_F(0.0);
sharedV[5*blockDim.x] = ENERGY_F(0.0);
}
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT xtmp,ytmp,ztmp;
X_FLOAT4 myxtype;
F_FLOAT fxtmp,fytmp,fztmp,fpair;
F_FLOAT delx,dely,delz;
F_FLOAT factor_lj,factor_coul;
F_FLOAT qtmp;
int itype,i,j;
int jnum=0;
int* jlist;
if(ii < _inum)
{
i = _ilist[ii];
myxtype=fetchXType(i);
xtmp=myxtype.x;
ytmp=myxtype.y;
ztmp=myxtype.z;
itype=static_cast <int> (myxtype.w);
fxtmp = F_F(0.0);
fytmp = F_F(0.0);
fztmp = F_F(0.0);
if(coul_type!=COUL_NONE)
qtmp = fetchQ(i);
jnum = _numneigh[i];
jlist = &_neighbors[i];
}
__syncthreads();
for (int jj = 0; jj < jnum; jj++)
{
if(ii < _inum)
if(jj<jnum)
{
fpair=F_F(0.0);
j = jlist[jj*_nlocal];
factor_lj = _special_lj[sbmask(j)];
if(coul_type!=COUL_NONE)
factor_coul = _special_coul[sbmask(j)];
j &= NEIGHMASK;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int> (myxtype.w);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
if (in_cutoff)
{
switch(pair_type)
{
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_CG_CMM:
fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
}
}
if(coul_type!=COUL_NONE)
{
const F_FLOAT qiqj=qtmp*fetchQ(j);
if(qiqj*qiqj>1e-8)
{
const bool in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if (in_coul_cutoff)
{
switch(coul_type)
{
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CUT:
{
const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq);
if(eflag)
{
ecoul += forcecoul;
}
fpair += forcecoul*(F_F(1.0)/rsq);
}
break;
case COUL_DEBYE:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0)/r;
const F_FLOAT screening = _EXP_(-_kappa*r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag)
{
ecoul += forcecoul*rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul*r2inv;
}
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_LONG:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij*grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij);
const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
if(eflag)
{
ecoul += prefactor*erfc;
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
}
fpair += forcecoul*r2inv;
}
break;
}
}
in_cutoff=in_cutoff || in_coul_cutoff;
}
}
if (in_cutoff)
{
F_FLOAT dxfp,dyfp,dzfp;
fxtmp += dxfp = delx*fpair;
fytmp += dyfp = dely*fpair;
fztmp += dzfp = delz*fpair;
if(vflag)
{
sharedV[0 * blockDim.x]+= delx*dxfp;
sharedV[1 * blockDim.x]+= dely*dyfp;
sharedV[2 * blockDim.x]+= delz*dzfp;
sharedV[3 * blockDim.x]+= delx*dyfp;
sharedV[4 * blockDim.x]+= delx*dzfp;
sharedV[5 * blockDim.x]+= dely*dzfp;
}
}
}
}
__syncthreads();
if(ii < _inum)
{
F_FLOAT* my_f;
if(_collect_forces_later)
{
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
buffer=&buffer[1 * gridDim.x * gridDim.y];
if(coul_type!=COUL_NONE)
buffer=&buffer[1 * gridDim.x * gridDim.y];
}
if(vflag)
{
buffer=&buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_FLOAT*) buffer;
my_f += i;
*my_f = fxtmp; my_f += _nmax;
*my_f = fytmp; my_f += _nmax;
*my_f = fztmp;
}
else
{
my_f = _f + i;
*my_f += fxtmp; my_f += _nmax;
*my_f += fytmp; my_f += _nmax;
*my_f += fztmp;
}
}
__syncthreads();
if(eflag)
{
sharedE[0] = evdwl;
if(coul_type!=COUL_NONE)
sharedECoul[0] = ecoul;
}
if(eflag_atom && i<_nlocal)
{
if(coul_type!=COUL_NONE)
_eatom[i] += evdwl + ecoul;
else
_eatom[i] += evdwl;
}
if(vflag_atom && i<_nlocal)
{
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
_vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
_vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
_vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
_vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
_vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
}
if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0);
}
template <const PAIR_FORCES pair_type,const COUL_FORCES coul_type,const unsigned int extended_data>
__global__ void Pair_Kernel_BpA(int eflag, int vflag,int eflag_atom,int vflag_atom)
{
int ii = (blockIdx.x*gridDim.y+blockIdx.y);
if( ii >= _inum )
return;
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
F_FLOAT3* sharedVirial1;
F_FLOAT3* sharedVirial2;
F_FLOAT* sharedEnergy;
F_FLOAT* sharedEnergyCoul;
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
if(vflag)
{
sharedVirial1 = &sharedForce[64];
sharedVirial2 = &sharedVirial1[64];
}
else
{
sharedVirial1 = &sharedForce[0];
sharedVirial2 = &sharedVirial1[0];
}
if(eflag)
{
if(vflag||vflag_atom)
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_FLOAT*) &sharedForce[64];
if(coul_type!=COUL_NONE)
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
}
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_FLOAT xtmp,ytmp,ztmp;
X_FLOAT4 myxtype;
F_FLOAT delx,dely,delz;
F_FLOAT factor_lj,factor_coul;
F_FLOAT fpair;
F_FLOAT qtmp;
int itype,jnum,i,j;
int* jlist;
i = _ilist[ii];
myxtype = fetchXType(i);
xtmp=myxtype.x;
ytmp=myxtype.y;
ztmp=myxtype.z;
itype=static_cast <int> (myxtype.w);
if(coul_type!=COUL_NONE)
qtmp = fetchQ(i);
jnum = _numneigh[i];
jlist = &_neighbors[i*_maxneighbors];
__syncthreads();
for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x)
{
if(jj<jnum)
{
fpair=F_F(0.0);
j = jlist[jj];
factor_lj = _special_lj[sbmask(j)];
if(coul_type!=COUL_NONE)
factor_coul = _special_coul[sbmask(j)];
j &= NEIGHMASK;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int> (myxtype.w);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
if (in_cutoff)
{
switch(pair_type)
{
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_CG_CMM:
fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
}
}
if(coul_type!=COUL_NONE)
{
const F_FLOAT qiqj=qtmp*fetchQ(j);
if(qiqj*qiqj>(1e-8f))
{
in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if (in_coul_cutoff)
{
switch(coul_type)
{
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_LONG:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij*grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij);
const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
if(eflag)
{
ecoul += prefactor*erfc;
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
}
fpair += forcecoul*r2inv;
}
break;
case COUL_DEBYE:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0)/r;
const F_FLOAT screening = _EXP_(-_kappa*r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag)
{
ecoul += forcecoul*rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul*r2inv;
}
break;
case COUL_CUT:
{
const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq);
if(eflag)
{
ecoul += forcecoul;
}
fpair += forcecoul*(F_F(1.0)/rsq);
}
break;
}
}
}
}
if (in_cutoff||in_coul_cutoff)
{
F_FLOAT dxfp,dyfp,dzfp;
partialForce.x += dxfp = delx*fpair;
partialForce.y += dyfp = dely*fpair;
partialForce.z += dzfp = delz*fpair;
if(vflag)
{
partialVirial1.x+= delx*dxfp;
partialVirial1.y+= dely*dyfp;
partialVirial1.z+= delz*dzfp;
partialVirial2.x+= delx*dyfp;
partialVirial2.y+= delx*dzfp;
partialVirial2.z+= dely*dzfp;
}
}
}
}
if(eflag)
{
sharedEnergy[threadIdx.x]= evdwl;
if(coul_type!=COUL_NONE)
sharedEnergyCoul[threadIdx.x]= ecoul;
}
sharedForce[threadIdx.x]=partialForce;
if(vflag)
{
sharedVirial1[threadIdx.x]=partialVirial1;
sharedVirial2[threadIdx.x]=partialVirial2;
}
__syncthreads();
for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 )
{
if( threadIdx.x < s )
{
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
if(vflag)
{
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
}
if(eflag)
{
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
if(coul_type!=COUL_NONE)
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
}
}
__syncthreads();
}
if(threadIdx.x == 0)
{
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
ENERGY_FLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
_eatom[i] = tmp_evdwl;
buffer=&buffer[gridDim.x * gridDim.y];
if(coul_type!=COUL_NONE)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0];
if(eflag_atom)
_eatom[i] += tmp_evdwl;
buffer=&buffer[gridDim.x * gridDim.y];
}
}
if(vflag)
{
ENERGY_FLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i+0*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
if(vflag_atom) _vatom[i+1*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
if(vflag_atom) _vatom[i+2*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
if(vflag_atom) _vatom[i+3*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
if(vflag_atom) _vatom[i+4*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
if(vflag_atom) _vatom[i+5*_nmax] = tmp;
buffer=&buffer[6 * gridDim.x * gridDim.y];
}
F_FLOAT* my_f;
if(_collect_forces_later)
{
my_f = (F_FLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x; my_f += _nmax;
*my_f = sharedForce[0].y; my_f += _nmax;
*my_f = sharedForce[0].z;
}
else
{
my_f = _f + i;
*my_f += sharedForce[0].x; my_f += _nmax;
*my_f += sharedForce[0].y; my_f += _nmax;
*my_f += sharedForce[0].z;
}
}
}
template <const PAIR_FORCES pair_type,const COUL_FORCES coul_type,const unsigned int extended_data>
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedECoul;
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
if(eflag||eflag_atom)
{
sharedE = &sharedmem[threadIdx.x];
sharedE[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
if(coul_type!=COUL_NONE)
{
sharedECoul = sharedE + blockDim.x;
sharedECoul[0] = ENERGY_F(0.0);
sharedV += blockDim.x;
}
}
if(vflag||vflag_atom)
{
sharedV[0*blockDim.x] = ENERGY_F(0.0);
sharedV[1*blockDim.x] = ENERGY_F(0.0);
sharedV[2*blockDim.x] = ENERGY_F(0.0);
sharedV[3*blockDim.x] = ENERGY_F(0.0);
sharedV[4*blockDim.x] = ENERGY_F(0.0);
sharedV[5*blockDim.x] = ENERGY_F(0.0);
}
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT xtmp,ytmp,ztmp;
X_FLOAT4 myxtype;
F_FLOAT fxtmp,fytmp,fztmp,fpair;
F_FLOAT delx,dely,delz;
F_FLOAT factor_lj,factor_coul;
F_FLOAT qtmp;
int itype,i,j;
int jnum=0;
int* jlist;
if(ii < (comm_phase<2?_inum:_inum_border[0]))
{
i = comm_phase<2? _ilist[ii] : _ilist_border[ii] ;
myxtype=fetchXType(i);
myxtype=_x_type[i];
xtmp=myxtype.x;
ytmp=myxtype.y;
ztmp=myxtype.z;
itype=static_cast <int> (myxtype.w);
fxtmp = F_F(0.0);
fytmp = F_F(0.0);
fztmp = F_F(0.0);
if(coul_type!=COUL_NONE)
qtmp = fetchQ(i);
jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]);
jlist = comm_phase==0? &_neighbors[i]: (comm_phase==1?&_neighbors_inner[i]:&_neighbors_border[ii]);
}
__syncthreads();
for (int jj = 0; jj < jnum; jj++)
{
if(ii < (comm_phase<2?_inum:_inum_border[0]))
if(jj<jnum)
{
fpair=F_F(0.0);
j = jlist[jj*_nlocal];
factor_lj = j<_nall ? F_F(1.0) : _special_lj[j/_nall];
if(coul_type!=COUL_NONE)
factor_coul = j<_nall ? F_F(1.0) : _special_coul[j/_nall];
j = j<_nall ? j : j % _nall;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int> (myxtype.w);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
if (in_cutoff)
{
switch(pair_type)
{
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_CG_CMM:
fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
}
}
if(coul_type!=COUL_NONE)
{
const F_FLOAT qiqj=qtmp*fetchQ(j);
if(qiqj*qiqj>1e-8)
{
const bool in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if (in_coul_cutoff)
{
switch(coul_type)
{
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CUT:
{
const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq);
if(eflag)
{
ecoul += forcecoul;
}
fpair += forcecoul*(F_F(1.0)/rsq);
}
break;
case COUL_DEBYE:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0)/r;
const F_FLOAT screening = _EXP_(-_kappa*r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag)
{
ecoul += forcecoul*rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul*r2inv;
}
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_LONG:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij*grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij);
const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
if(eflag)
{
ecoul += prefactor*erfc;
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
}
fpair += forcecoul*r2inv;
}
break;
}
}
in_cutoff=in_cutoff || in_coul_cutoff;
}
}
if (in_cutoff)
{
F_FLOAT dxfp,dyfp,dzfp;
fxtmp += dxfp = delx*fpair;
fytmp += dyfp = dely*fpair;
fztmp += dzfp = delz*fpair;
if(vflag)
{
sharedV[0 * blockDim.x]+= delx*dxfp;
sharedV[1 * blockDim.x]+= dely*dyfp;
sharedV[2 * blockDim.x]+= delz*dzfp;
sharedV[3 * blockDim.x]+= delx*dyfp;
sharedV[4 * blockDim.x]+= delx*dzfp;
sharedV[5 * blockDim.x]+= dely*dzfp;
}
}
}
}
__syncthreads();
if(ii < (comm_phase<2?_inum:_inum_border[0]))
{
F_FLOAT* my_f;
if(_collect_forces_later)
{
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
buffer=&buffer[1 * gridDim.x * gridDim.y];
if(coul_type!=COUL_NONE)
buffer=&buffer[1 * gridDim.x * gridDim.y];
}
if(vflag)
{
buffer=&buffer[6 * gridDim.x * gridDim.y];
}
my_f = (F_FLOAT*) buffer;
my_f += i;
*my_f = fxtmp; my_f += _nmax;
*my_f = fytmp; my_f += _nmax;
*my_f = fztmp;
}
else
{
my_f = _f + i;
*my_f += fxtmp; my_f += _nmax;
*my_f += fytmp; my_f += _nmax;
*my_f += fztmp;
}
}
__syncthreads();
if(eflag)
{
sharedE[0] = evdwl;
if(coul_type!=COUL_NONE)
sharedECoul[0] = ecoul;
}
if(eflag_atom && i<_nlocal)
{
if(coul_type!=COUL_NONE)
_eatom[i] += evdwl + ecoul;
else
_eatom[i] += evdwl;
}
if(vflag_atom && i<_nlocal)
{
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
_vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
_vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
_vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
_vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
_vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
}
if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,coul_type!=COUL_NONE?1:0);
}
template <const PAIR_FORCES pair_type,const COUL_FORCES coul_type,const unsigned int extended_data>
__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag,int eflag_atom,int vflag_atom, int comm_phase)
{
int ii = (blockIdx.x*gridDim.y+blockIdx.y);
if( ii >= (comm_phase<2?_inum:_inum_border[0]))
return;
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT ecoul = ENERGY_F(0.0);
F_FLOAT3* sharedVirial1;
F_FLOAT3* sharedVirial2;
F_FLOAT* sharedEnergy;
F_FLOAT* sharedEnergyCoul;
F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
if(vflag)
{
sharedVirial1 = &sharedForce[64];
sharedVirial2 = &sharedVirial1[64];
}
else
{
sharedVirial1 = &sharedForce[0];
sharedVirial2 = &sharedVirial1[0];
}
if(eflag)
{
if(vflag||vflag_atom)
sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
else
sharedEnergy = (F_FLOAT*) &sharedForce[64];
if(coul_type!=COUL_NONE)
sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
}
F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
X_FLOAT xtmp,ytmp,ztmp;
X_FLOAT4 myxtype;
F_FLOAT delx,dely,delz;
F_FLOAT factor_lj,factor_coul;
F_FLOAT fpair;
F_FLOAT qtmp;
int itype,jnum,i,j;
int* jlist;
i = comm_phase<2? _ilist[ii] : _ilist_border[ii];
myxtype = fetchXType(i);
xtmp=myxtype.x;
ytmp=myxtype.y;
ztmp=myxtype.z;
itype=static_cast <int> (myxtype.w);
if(coul_type!=COUL_NONE)
qtmp = fetchQ(i);
jnum = comm_phase==0? _numneigh[i]: (comm_phase==1?_numneigh_inner[i]:_numneigh_border[ii]);
jlist = comm_phase==0? &_neighbors[i*_maxneighbors]: (comm_phase==1?&_neighbors_inner[i*_maxneighbors]:&_neighbors_border[ii*_maxneighbors]);
__syncthreads();
for (int jj = threadIdx.x; jj < jnum+blockDim.x; jj+=blockDim.x)
{
if(jj<jnum)
{
fpair=F_F(0.0);
j = jlist[jj];
factor_lj = j<_nall ? F_F(1.0) : _special_lj[j/_nall];
if(coul_type!=COUL_NONE)
factor_coul = j<_nall ? F_F(1.0) : _special_coul[j/_nall];
j = j<_nall ? j : j % _nall;
myxtype = fetchXType(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
int jtype = static_cast <int> (myxtype.w);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
bool in_cutoff = rsq < (_cutsq_global > X_F(0.0)? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
bool in_coul_cutoff;
if (in_cutoff)
{
switch(pair_type)
{
case PAIR_BORN:
fpair += PairBornCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_BUCK:
fpair += PairBuckCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_CG_CMM:
fpair += PairCGCMMCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CHARMM:
fpair += PairLJCharmmCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CLASS2:
fpair += PairLJClass2Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_CUT:
fpair += PairLJCutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_EXPAND:
fpair += PairLJExpandCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_GROMACS:
fpair += PairLJGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ_SMOOTH:
fpair += PairLJSmoothCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_LJ96_CUT:
fpair += PairLJ96CutCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE_R6:
fpair += PairMorseR6Cuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
case PAIR_MORSE:
fpair += PairMorseCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_lj,eflag,evdwl);
break;
}
}
if(coul_type!=COUL_NONE)
{
const F_FLOAT qiqj=qtmp*fetchQ(j);
if(qiqj*qiqj>(1e-8f))
{
in_coul_cutoff =
rsq < (_cut_coulsq_global > X_F(0.0)? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
if (in_coul_cutoff)
{
switch(coul_type)
{
case COUL_CHARMM:
fpair += CoulCharmmCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_CHARMM_IMPLICIT:
fpair += CoulCharmmImplicitCuda_Eval(rsq,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_GROMACS:
fpair += CoulGromacsCuda_Eval(rsq,itype * _cuda_ntypes + jtype,factor_coul,eflag,ecoul,qiqj);
break;
case COUL_LONG:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT grij = _g_ewald * r;
const F_FLOAT expm2 = _EXP_(-grij*grij);
const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P*grij);
const F_FLOAT erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
const F_FLOAT prefactor = _qqrd2e* qiqj*(F_F(1.0)/r);
F_FLOAT forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
if(eflag)
{
ecoul += prefactor*erfc;
if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
}
fpair += forcecoul*r2inv;
}
break;
case COUL_DEBYE:
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const X_FLOAT r = _RSQRT_(r2inv);
const X_FLOAT rinv = F_F(1.0)/r;
const F_FLOAT screening = _EXP_(-_kappa*r);
F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
if(eflag)
{
ecoul += forcecoul*rinv;
}
forcecoul *= (_kappa + rinv);
fpair += forcecoul*r2inv;
}
break;
case COUL_CUT:
{
const F_FLOAT forcecoul = factor_coul*_qqrd2e* qiqj*_RSQRT_(rsq);
if(eflag)
{
ecoul += forcecoul;
}
fpair += forcecoul*(F_F(1.0)/rsq);
}
break;
}
}
}
}
if (in_cutoff||in_coul_cutoff)
{
F_FLOAT dxfp,dyfp,dzfp;
partialForce.x += dxfp = delx*fpair;
partialForce.y += dyfp = dely*fpair;
partialForce.z += dzfp = delz*fpair;
if(vflag)
{
partialVirial1.x+= delx*dxfp;
partialVirial1.y+= dely*dyfp;
partialVirial1.z+= delz*dzfp;
partialVirial2.x+= delx*dyfp;
partialVirial2.y+= delx*dzfp;
partialVirial2.z+= dely*dzfp;
}
}
}
}
if(eflag)
{
sharedEnergy[threadIdx.x]= evdwl;
if(coul_type!=COUL_NONE)
sharedEnergyCoul[threadIdx.x]= ecoul;
}
sharedForce[threadIdx.x]=partialForce;
if(vflag)
{
sharedVirial1[threadIdx.x]=partialVirial1;
sharedVirial2[threadIdx.x]=partialVirial2;
}
__syncthreads();
for( unsigned int s = blockDim.x >> 1; s > 0; s >>= 1 )
{
if( threadIdx.x < s )
{
sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
if(vflag)
{
sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
}
if(eflag)
{
sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
if(coul_type!=COUL_NONE)
sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
}
}
__syncthreads();
}
if(threadIdx.x == 0)
{
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
ENERGY_FLOAT tmp_evdwl;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergy[0];
if(eflag_atom)
_eatom[i] = tmp_evdwl;
buffer=&buffer[gridDim.x * gridDim.y];
if(coul_type!=COUL_NONE)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]=tmp_evdwl=ENERGY_F(0.5) * sharedEnergyCoul[0];
if(eflag_atom)
_eatom[i] += tmp_evdwl;
buffer=&buffer[gridDim.x * gridDim.y];
}
}
if(vflag)
{
ENERGY_FLOAT tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
if(vflag_atom) _vatom[i+0*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
if(vflag_atom) _vatom[i+1*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
if(vflag_atom) _vatom[i+2*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
if(vflag_atom) _vatom[i+3*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
if(vflag_atom) _vatom[i+4*_nmax] = tmp;
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y]= tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
if(vflag_atom) _vatom[i+5*_nmax] = tmp;
buffer=&buffer[6 * gridDim.x * gridDim.y];
}
F_FLOAT* my_f;
if(_collect_forces_later)
{
my_f = (F_FLOAT*) buffer;
my_f += i;
*my_f = sharedForce[0].x; my_f += _nmax;
*my_f = sharedForce[0].y; my_f += _nmax;
*my_f = sharedForce[0].z;
}
else
{
my_f = _f + i;
*my_f += sharedForce[0].x; my_f += _nmax;
*my_f += sharedForce[0].y; my_f += _nmax;
*my_f += sharedForce[0].z;
}
}
}
__global__ void Pair_GenerateXType_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nall)
{
X_FLOAT4 xtype;
xtype.x=_x[i];
xtype.y=_x[i+_nmax];
xtype.z=_x[i+2*_nmax];
xtype.w=_type[i];
_x_type[i]=xtype;
}
}
__global__ void Pair_GenerateVRadius_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nall)
{
V_FLOAT4 vradius;
vradius.x=_v[i];
vradius.y=_v[i+_nmax];
vradius.z=_v[i+2*_nmax];
vradius.w=_radius[i];
_v_radius[i]=vradius;
}
}
__global__ void Pair_GenerateOmegaRmass_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nall)
{
V_FLOAT4 omegarmass;
omegarmass.x=_omega[i];
omegarmass.y=_omega[i+_nmax];
omegarmass.z=_omega[i+2*_nmax];
omegarmass.w=_rmass[i];
_omega_rmass[i]=omegarmass;
}
}
__global__ void Pair_RevertXType_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nall)
{
X_FLOAT4 xtype=_x_type[i];
_x[i]=xtype.x;
_x[i+_nmax]=xtype.y;
_x[i+2*_nmax]=xtype.z;
_type[i]=static_cast <int> (xtype.w);
}
}
__global__ void Pair_BuildXHold_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nall)
{
X_FLOAT4 xtype=_x_type[i];
_xhold[i]=xtype.x;
_xhold[i+_nmax]=xtype.y;
_xhold[i+2*_nmax]=xtype.z;
}
}
__global__ void Pair_CollectForces_Kernel(int nperblock,int n)
{
int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i>=_nlocal) return;
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
F_FLOAT* my_f = _f + i;
buf_f += i;
*my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax;
*my_f += * buf_f; my_f+=_nmax; buf_f+=_nmax;
*my_f += * buf_f; my_f+=_nmax;
}