// lammps/lib/cuda/pair_eam_cuda_kernel_nc.cu

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
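
/* Device kernels for the EAM pair style of the USER-CUDA package.

   EAM is evaluated in two passes over full neighbor lists:
   PairEAMCuda_Kernel1 accumulates the electron density _rho[i] and the
   derivative of the embedding energy _fp[i] = F'(rho_i); after _fp has
   been forward-communicated to the ghost atoms (see the Pack/Unpack
   kernels at the end of this file), PairEAMCuda_Kernel2 computes the
   pair forces and, if requested, energies and virials.

   Judging from the fetch helpers below, the rhor and z2r tables are
   packed as two F_FLOAT4 per spline interval: the derivative-polynomial
   coefficients first (fetched with index k), the function-value
   coefficients second (index k+1).  _frho_spline keeps a flat layout of
   EAM_COEFF_LENGTH coefficients per interval, as in the CPU pair_eam. */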
// Fetch one F_FLOAT4 of rhor-spline coefficients, through the texture
// cache if enabled, otherwise straight from global memory.
static __device__ inline F_FLOAT4 fetchRhor(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_rhor_spline_tex, i);
#else
  return tex1Dfetch_double_f(_rhor_spline_tex, i);
#endif
#else
  return _rhor_spline[i];
#endif
}

// Same for the z2r-spline coefficients.
static __device__ inline F_FLOAT4 fetchZ2r(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
  return tex1Dfetch(_z2r_spline_tex, i);
#else
  return tex1Dfetch_double_f(_z2r_spline_tex, i);
#endif
#else
  return _z2r_spline[i];
#endif
}
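
/* Pass 1: for every local atom i, accumulate rho_i = sum_j rho(r_ij)
   from the rhor splines, then evaluate fp_i = F'(rho_i) and, if energy
   is requested, the embedding energy F(rho_i) from the frho spline.
   Per-block embedding energies are staged in _buffer so that Kernel2
   can fold them into the global energy reduction. */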
__global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  ENERGY_FLOAT* sharedE;
  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];

  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_FLOAT xtmp, ytmp, ztmp;
  X_FLOAT4 myxtype;
  F_FLOAT delx, dely, delz;
  int itype;
  int i = _nlocal;   // sentinel: keeps idle threads out of the i < _nlocal branches below
  int jnum = 0;
  int* jlist;

  if(ii < _inum) {
    i = _ilist[ii];
    myxtype = fetchXType(i);
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast<int>(myxtype.w);

    jnum = _numneigh[i];
    jlist = &_neighbors[i];   // neighbor lists are stored with stride _nlocal

    if(i < _nlocal) _rho[i] = F_F(0.0);
  }

  __syncthreads();

  // accumulate rho_i = sum_j rho(r_ij) over the full neighbor list
  for(int jj = 0; jj < jnum; jj++) {
    if(ii < _inum) {
      const int j = jlist[jj * _nlocal];
      myxtype = fetchXType(j);
      delx = xtmp - myxtype.x;
      dely = ytmp - myxtype.y;
      delz = ztmp - myxtype.z;
      int jtype = static_cast<int>(myxtype.w);
      const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;

      if(rsq < _cutsq_global) {
        // spline interval m and fractional coordinate p within it
        F_FLOAT p = _SQRT_(rsq) * _rdr + F_F(1.0);
        int m = static_cast<int>(p);
        m = MIN(m, _nr - 1);
        p -= m;
        p = MIN(p, F_F(1.0));

        // second F_FLOAT4 of the interval holds the function-value coefficients
        int k = (static_cast<int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
        F_FLOAT4 c = fetchRhor(k + 1);
        _rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z;
      }
    }
  }

  if(ii < _inum) {
    // embedding term: fp_i = F'(rho_i) from the frho spline
    F_FLOAT p = _rho[i] * _rdrho + F_F(1.0);
    int m = static_cast<int>(p);
    m = MAX(1, MIN(m, _nrho - 1));
    p -= m;
    p = MIN(p, F_F(1.0));
    F_FLOAT* coeff = &_frho_spline[(static_cast<int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
    _fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2];

    if(eflag || eflag_atom) {
      // embedding energy F(rho_i)
      sharedmem[threadIdx.x] += ((coeff[3] * p + coeff[4]) * p + coeff[5]) * p + coeff[6];
    }
  }

  __syncthreads();

  if(eflag || eflag_atom) {
    if(i < _nlocal && eflag_atom)
      _eatom[i] += sharedmem[threadIdx.x];

    reduceBlock(sharedmem);
    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
    // factor 2 compensates the 0.5 applied to pair sums in the final reduction
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0];
  }
}
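
/* Pass 2: with fp available for local and ghost atoms, compute the pair
   forces.  For each neighbor the splines yield rhoip = d rho_i/dr,
   rhojp = d rho_j/dr, z2 = r*phi(r) and z2p = d(r*phi)/dr, giving
   phi = z2/r, phip = z2p/r - phi/r and the radial force
   fpair = -(fp_i*rhojp + fp_j*rhoip + phip)/r.  Energies and virials
   are reduced per block via PairVirialCompute_A_Kernel. */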
__global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
  ENERGY_FLOAT* sharedE;
  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];

  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_FLOAT xtmp, ytmp, ztmp;
  X_FLOAT4 myxtype;
  F_FLOAT fxtmp, fytmp, fztmp, fpair;
  F_FLOAT delx, dely, delz;
  int itype;
  int i = _nlocal;   // sentinel, as in Kernel1: an uninitialized i would make the
                     // i < _nlocal branches below undefined for idle threads
  int jnum = 0;
  int* jlist;

  if(ii < _inum) {
    i = _ilist[ii];
    myxtype = fetchXType(i);
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast<int>(myxtype.w);

    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    jnum = _numneigh[i];
    jlist = &_neighbors[i];

    if(i < _nlocal) _rho[i] = F_F(0.0);
  }

  // pick up the per-block embedding energies staged in _buffer by Kernel1
  if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii];

  __syncthreads();

  for(int jj = 0; jj < jnum; jj++) {
    if(ii < _inum) {
      const int j = jlist[jj * _nlocal];
      myxtype = fetchXType(j);
      delx = xtmp - myxtype.x;
      dely = ytmp - myxtype.y;
      delz = ztmp - myxtype.z;
      int jtype = static_cast<int>(myxtype.w);
      const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;

      if(rsq < _cutsq_global) {
        // spline interval m and fractional coordinate p within it
        F_FLOAT r = _SQRT_(rsq);
        F_FLOAT p = r * _rdr + F_F(1.0);
        int m = static_cast<int>(p);
        m = MIN(m, _nr - 1);
        p -= m;
        p = MIN(p, F_F(1.0));

        // density derivatives: type j's density at i and vice versa
        int k = (static_cast<int>(_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
        F_FLOAT4 c = fetchRhor(k);
        F_FLOAT rhoip = (c.x * p + c.y) * p + c.z;
        k = (static_cast<int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
        c = fetchRhor(k);
        F_FLOAT rhojp = (c.x * p + c.y) * p + c.z;

        // z2 = r*phi(r) and its derivative z2p from the z2r spline
        k = (static_cast<int>(_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
        c = fetchZ2r(k);
        F_FLOAT z2p = (c.x * p + c.y) * p + c.z;
        c = fetchZ2r(k + 1);
        F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;

        F_FLOAT recip = F_F(1.0) / r;
        F_FLOAT phi = z2 * recip;
        F_FLOAT phip = z2p * recip - phi * recip;
        F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
        fpair = -psip * recip;

        F_FLOAT dxfp, dyfp, dzfp;
        fxtmp += dxfp = delx * fpair;
        fytmp += dyfp = dely * fpair;
        fztmp += dzfp = delz * fpair;
        evdwl += phi;   // full-list double counting is halved in the final reduction

        if(vflag || vflag_atom) {
          sharedV[0 * blockDim.x] += delx * dxfp;
          sharedV[1 * blockDim.x] += dely * dyfp;
          sharedV[2 * blockDim.x] += delz * dzfp;
          sharedV[3 * blockDim.x] += delx * dyfp;
          sharedV[4 * blockDim.x] += delx * dzfp;
          sharedV[5 * blockDim.x] += dely * dzfp;
        }
      }
    }
  }

  __syncthreads();

  if(ii < _inum) {
    F_FLOAT* my_f;

    if(_collect_forces_later) {
      // stage forces behind the energy/virial slots of _buffer; a separate
      // collection kernel sums them into _f afterwards
      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
      if(eflag) buffer = &buffer[1 * gridDim.x * gridDim.y];
      if(vflag) buffer = &buffer[6 * gridDim.x * gridDim.y];
      my_f = (F_FLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp; my_f += _nmax;
      *my_f = fytmp; my_f += _nmax;
      *my_f = fztmp;
    } else {
      my_f = _f + i;
      *my_f += fxtmp; my_f += _nmax;
      *my_f += fytmp; my_f += _nmax;
      *my_f += fztmp;
    }
  }

  __syncthreads();

  if(eflag) {
    sharedE[0] = evdwl;
  }

  if(eflag_atom && i < _nlocal) {
    _eatom[i] += evdwl;
  }

  if(vflag_atom && i < _nlocal) {
    _vatom[i]             += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i + _nmax]     += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }

  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
}
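
/* Between the two passes, the per-atom fp values must be made available
   to the ghost atoms on neighboring processors.  The pack kernel gathers
   fp for the atoms in a swap's send list into a communication buffer;
   the unpack kernel scatters received values into the ghost section
   starting at index 'first'. */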
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];
    buffer[i] = _fp[j];
  }
}

__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    _fp[i + first] = buffer[i];
  }
}
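
/* Host-side call sequence, a sketch only: grid, threads, and
   shared_bytes are illustrative placeholders, not the package's actual
   variable names; the real driver lives in pair_eam_cuda.cu.

     // pass 1: densities, fp = F'(rho), embedding energies
     PairEAMCuda_Kernel1<<<grid, threads, shared_bytes>>>(eflag, vflag, eflag_atom, vflag_atom);

     // forward-communicate _fp to ghost atoms for every swap, using
     // PairEAMCuda_PackComm_Kernel / PairEAMCuda_UnpackComm_Kernel

     // pass 2: forces, pair energies, virials
     PairEAMCuda_Kernel2<<<grid, threads, shared_bytes>>>(eflag, vflag, eflag_atom, vflag_atom);
*/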