lammps/lib/cuda/pair_gran_hooke_cuda_kernel...

220 lines
5.9 KiB
Plaintext

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag,int eflag_atom,int vflag_atom,int** firstneight,int* binned_id
,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat, F_FLOAT xmu)
{
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
ENERGY_FLOAT* sharedE;
ENERGY_FLOAT* sharedV;
if(eflag||eflag_atom)
{
sharedE = &sharedmem[threadIdx.x];
sharedV = &sharedmem[0];
sharedE[0] = ENERGY_F(0.0); sharedV+=blockDim.x;
}
if(vflag||vflag_atom)
{
sharedV += threadIdx.x;
sharedV[0*blockDim.x] = ENERGY_F(0.0);
sharedV[1*blockDim.x] = ENERGY_F(0.0);
sharedV[2*blockDim.x] = ENERGY_F(0.0);
sharedV[3*blockDim.x] = ENERGY_F(0.0);
sharedV[4*blockDim.x] = ENERGY_F(0.0);
sharedV[5*blockDim.x] = ENERGY_F(0.0);
}
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
MYEMUDBG( if(ii==0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n"); )
X_FLOAT xtmp,ytmp,ztmp;
X_FLOAT4 myxtype;
V_FLOAT4 myvradius, ovradius;
F_FLOAT fxtmp,fytmp,fztmp,torquextmp,torqueytmp,torqueztmp;
F_FLOAT delx,dely,delz;
F_FLOAT radi,radj,radsum,r,rsqinv;
F_FLOAT vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
F_FLOAT wr1,wr2,wr3;
F_FLOAT vtr1,vtr2,vtr3,vrel;
F_FLOAT meff,damp,ccel,tor1,tor2,tor3;
F_FLOAT fn,fs,ft,fs1,fs2,fs3;
int jnum =0;
int i,j;
int* jlist;
if(ii < _inum)
{
i = _ilist[ii];
myxtype = fetchXType(i);
myvradius = fetchVRadius(i);
xtmp=myxtype.x;
ytmp=myxtype.y;
ztmp=myxtype.z;
radi = myvradius.w;
fxtmp = F_F(0.0);
fytmp = F_F(0.0);
fztmp = F_F(0.0);
torquextmp = F_F(0.0);
torqueytmp = F_F(0.0);
torqueztmp = F_F(0.0);
jnum = _numneigh[i];
jlist = &_neighbors[i];
}
__syncthreads();
for (int jj = 0; jj < jnum; jj++)
{
if(ii < _inum)
if(jj<jnum)
{
j = jlist[jj*_nlocal];
myxtype = fetchXType(j);
ovradius = fetchVRadius(j);
delx = xtmp - myxtype.x;
dely = ytmp - myxtype.y;
delz = ztmp - myxtype.z;
radj = ovradius.w;
radsum = radi + radj;
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if (rsq < radsum*radsum)
{
const F_FLOAT rinv = _RSQRT_(rsq);
r = F_F(1.0)/rinv;
rsqinv = F_F(1.0)/rsq;
// relative translational velocity
vr1 = myvradius.x - ovradius.x;
vr2 = myvradius.y - ovradius.y;
vr3 = myvradius.z - ovradius.z;
// normal component
vnnr = vr1*delx + vr2*dely + vr3*delz;
vn1 = delx*vnnr * rsqinv;
vn2 = dely*vnnr * rsqinv;
vn3 = delz*vnnr * rsqinv;
// tangential component
vt1 = vr1 - vn1;
vt2 = vr2 - vn2;
vt3 = vr3 - vn3;
// relative rotational velocity
V_FLOAT4 omegarmass_i=fetchOmegaRmass(i);
V_FLOAT4 omegarmass_j=fetchOmegaRmass(j);
wr1 = (radi*omegarmass_i.x + radj*omegarmass_j.x) * rinv;
wr2 = (radi*omegarmass_i.y + radj*omegarmass_j.y) * rinv;
wr3 = (radi*omegarmass_i.z + radj*omegarmass_j.z) * rinv;
meff = omegarmass_i.w*omegarmass_j.w / (omegarmass_i.w+omegarmass_j.w);
if (_mask[i] & _freeze_group_bit) meff = omegarmass_j.w;
if (_mask[j] & _freeze_group_bit) meff = omegarmass_i.w;
damp = meff*gamman*vnnr*rsqinv;
ccel = kn*(radsum-r)*rinv - damp;
vtr1 = vt1 - (delz*wr2-dely*wr3);
vtr2 = vt2 - (delx*wr3-delz*wr1);
vtr3 = vt3 - (dely*wr1-delx*wr2);
vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
vrel = _SQRT_(vrel);
fn = xmu * fabs(ccel*r);
fs = meff*gammat*vrel;
ft = (vrel != F_F(0.0))?MIN(fn,fs) / vrel:F_F(0.0);
fs1 = -ft*vtr1;
fs2 = -ft*vtr2;
fs3 = -ft*vtr3;
F_FLOAT dxfp,dyfp,dzfp;
fxtmp += dxfp = delx*ccel + fs1;
fytmp += dyfp = dely*ccel + fs2;
fztmp += dzfp = delz*ccel + fs3;
tor1 = rinv * (dely*fs3 - delz*fs2);
tor2 = rinv * (delz*fs1 - delx*fs3);
tor3 = rinv * (delx*fs2 - dely*fs1);
torquextmp -= radi*tor1;
torqueytmp -= radi*tor2;
torqueztmp -= radi*tor3;
if(vflag)
{
sharedV[0 * blockDim.x]+= delx*dxfp;
sharedV[1 * blockDim.x]+= dely*dyfp;
sharedV[2 * blockDim.x]+= delz*dzfp;
sharedV[3 * blockDim.x]+= delx*dyfp;
sharedV[4 * blockDim.x]+= delx*dzfp;
sharedV[5 * blockDim.x]+= dely*dzfp;
}
}
}
}
__syncthreads();
if(ii < _inum)
{
F_FLOAT* my_f = _f + i;
*my_f += fxtmp; my_f += _nmax;
*my_f += fytmp; my_f += _nmax;
*my_f += fztmp;
F_FLOAT* my_torque = _torque + i;
*my_torque += torquextmp; my_torque += _nmax;
*my_torque += torqueytmp; my_torque += _nmax;
*my_torque += torqueztmp;
}
__syncthreads();
if(eflag) sharedE[0] = evdwl;
if(eflag_atom && i<_nlocal) _eatom[i] += evdwl;
if(vflag_atom && i<_nlocal)
{
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
_vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
_vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
_vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
_vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
_vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
}
if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0);
}