mirror of https://github.com/lammps/lammps.git
341 lines
8.4 KiB
Plaintext
341 lines
8.4 KiB
Plaintext
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
|
|
Original Version:
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
|
|
-----------------------------------------------------------------------
|
|
|
|
USER-CUDA Package and associated modifications:
|
|
https://sourceforge.net/projects/lammpscuda/
|
|
|
|
Christian Trott, christian.trott@tu-ilmenau.de
|
|
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
|
|
Theoretical Physics II, University of Technology Ilmenau, Germany
|
|
|
|
See the README file in the USER-CUDA directory.
|
|
|
|
This software is distributed under the GNU General Public License.
|
|
------------------------------------------------------------------------- */
|
|
|
|
|
|
|
|
|
|
static __device__ inline F_FLOAT4 fetchRhor(int i)
|
|
{
|
|
#ifdef CUDA_USE_TEXTURE
|
|
#if F_PRECISION == 1
|
|
return tex1Dfetch(_rhor_spline_tex,i);
|
|
#else
|
|
return tex1Dfetch_double_f(_rhor_spline_tex,i);
|
|
#endif
|
|
#else
|
|
return _rhor_spline[i];
|
|
#endif
|
|
}
|
|
|
|
static __device__ inline F_FLOAT4 fetchZ2r(int i)
|
|
{
|
|
#ifdef CUDA_USE_TEXTURE
|
|
#if F_PRECISION == 1
|
|
return tex1Dfetch(_z2r_spline_tex,i);
|
|
#else
|
|
return tex1Dfetch_double_f(_z2r_spline_tex,i);
|
|
#endif
|
|
#else
|
|
return _z2r_spline[i];
|
|
#endif
|
|
}
|
|
|
|
__global__ void PairEAMCuda_Kernel1(int eflag, int vflag,int eflag_atom,int vflag_atom)
|
|
{
|
|
ENERGY_FLOAT* sharedE;
|
|
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
|
|
|
|
|
if(eflag||eflag_atom)
|
|
{
|
|
sharedE = &sharedmem[threadIdx.x];
|
|
sharedE[0] = ENERGY_F(0.0);
|
|
sharedV += blockDim.x;
|
|
}
|
|
if(vflag||vflag_atom)
|
|
{
|
|
sharedV[0*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[1*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[2*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[3*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[4*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[5*blockDim.x] = ENERGY_F(0.0);
|
|
}
|
|
|
|
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
|
|
X_FLOAT xtmp,ytmp,ztmp;
|
|
X_FLOAT4 myxtype;
|
|
F_FLOAT delx,dely,delz;
|
|
int itype;
|
|
int i=_nlocal;
|
|
int jnum=0;
|
|
int* jlist;
|
|
|
|
if(ii < _inum)
|
|
{
|
|
i = _ilist[ii];
|
|
|
|
myxtype=fetchXType(i);
|
|
xtmp=myxtype.x;
|
|
ytmp=myxtype.y;
|
|
ztmp=myxtype.z;
|
|
itype=static_cast <int> (myxtype.w);
|
|
|
|
jnum = _numneigh[i];
|
|
|
|
jlist = &_neighbors[i];
|
|
if(i<_nlocal)
|
|
_rho[i]=F_F(0.0);
|
|
}
|
|
__syncthreads();
|
|
|
|
for (int jj = 0; jj < jnum; jj++)
|
|
{
|
|
if(ii < _inum)
|
|
if(jj<jnum)
|
|
{
|
|
const int j = jlist[jj*_nlocal];
|
|
myxtype = fetchXType(j);
|
|
delx = xtmp - myxtype.x;
|
|
dely = ytmp - myxtype.y;
|
|
delz = ztmp - myxtype.z;
|
|
int jtype = static_cast <int> (myxtype.w);
|
|
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
|
if (rsq < _cutsq_global)
|
|
{
|
|
F_FLOAT p = sqrt(rsq)*_rdr + F_F(1.0);
|
|
int m = static_cast<int> (p);
|
|
m = MIN(m,_nr-1);
|
|
p -= m;
|
|
p = MIN(p,F_F(1.0));
|
|
|
|
int k=(static_cast <int> (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2;
|
|
F_FLOAT4 c=fetchRhor(k+1);
|
|
_rho[i] += ((c.w*p+c.x)*p+c.y)*p+c.z;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(ii < _inum)
|
|
{
|
|
|
|
F_FLOAT p = _rho[i]*_rdrho + F_F(1.0);
|
|
int m = static_cast<int> (p);
|
|
m = MAX(1,MIN(m,_nrho-1));
|
|
p -= m;
|
|
p = MIN(p,F_F(1.0));
|
|
F_FLOAT* coeff = &_frho_spline[(static_cast <int> (_type2frho[itype])*(_nrho+1)+m)*EAM_COEFF_LENGTH];
|
|
_fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
|
|
if (eflag||eflag_atom) {
|
|
sharedmem[threadIdx.x] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
|
|
}
|
|
|
|
}
|
|
__syncthreads();
|
|
if(eflag||eflag_atom)
|
|
{
|
|
if(i<_nlocal&&eflag_atom)
|
|
_eatom[i]+=sharedmem[threadIdx.x];
|
|
reduceBlock(sharedmem);
|
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
|
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0)*sharedmem[0];
|
|
}
|
|
}
|
|
|
|
__global__ void PairEAMCuda_Kernel2(int eflag, int vflag,int eflag_atom,int vflag_atom)
|
|
{
|
|
ENERGY_FLOAT evdwl = ENERGY_F(0.0);
|
|
|
|
ENERGY_FLOAT* sharedE;
|
|
ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
|
|
|
|
|
|
if(eflag||eflag_atom)
|
|
{
|
|
sharedE = &sharedmem[threadIdx.x];
|
|
sharedE[0] = ENERGY_F(0.0);
|
|
sharedV += blockDim.x;
|
|
}
|
|
if(vflag||vflag_atom)
|
|
{
|
|
sharedV[0*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[1*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[2*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[3*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[4*blockDim.x] = ENERGY_F(0.0);
|
|
sharedV[5*blockDim.x] = ENERGY_F(0.0);
|
|
}
|
|
|
|
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
|
|
X_FLOAT xtmp,ytmp,ztmp;
|
|
X_FLOAT4 myxtype;
|
|
F_FLOAT fxtmp,fytmp,fztmp,fpair;
|
|
F_FLOAT delx,dely,delz;
|
|
int itype,i;
|
|
int jnum=0;
|
|
int* jlist;
|
|
|
|
if(ii < _inum)
|
|
{
|
|
i = _ilist[ii];
|
|
|
|
myxtype=fetchXType(i);
|
|
xtmp=myxtype.x;
|
|
ytmp=myxtype.y;
|
|
ztmp=myxtype.z;
|
|
itype=static_cast <int> (myxtype.w);
|
|
fxtmp = F_F(0.0);
|
|
fytmp = F_F(0.0);
|
|
fztmp = F_F(0.0);
|
|
|
|
jnum = _numneigh[i];
|
|
|
|
jlist = &_neighbors[i];
|
|
if(i<_nlocal)
|
|
_rho[i]=F_F(0.0);
|
|
}
|
|
if(ii<gridDim.x*gridDim.y) evdwl=((ENERGY_FLOAT*) _buffer)[ii];
|
|
__syncthreads();
|
|
|
|
for (int jj = 0; jj < jnum; jj++)
|
|
{
|
|
if(ii < _inum)
|
|
if(jj<jnum)
|
|
{
|
|
const int j = jlist[jj*_nlocal];
|
|
myxtype = fetchXType(j);
|
|
delx = xtmp - myxtype.x;
|
|
dely = ytmp - myxtype.y;
|
|
delz = ztmp - myxtype.z;
|
|
int jtype = static_cast <int> (myxtype.w);
|
|
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
|
|
|
if (rsq < _cutsq_global)
|
|
{
|
|
F_FLOAT r = _SQRT_(rsq);
|
|
F_FLOAT p = r*_rdr + F_F(1.0);
|
|
int m = static_cast<int> (p);
|
|
m = MIN(m,_nr-1);
|
|
p -= m;
|
|
p = MIN(p,F_F(1.0));
|
|
|
|
int k=(static_cast <int> (_type2rhor[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2;
|
|
F_FLOAT4 c=fetchRhor(k);
|
|
F_FLOAT rhoip = (c.x*p + c.y)*p + c.z;
|
|
k=(static_cast <int> (_type2rhor[jtype*_cuda_ntypes+itype])*(_nr+1)+m)*2;
|
|
c=fetchRhor(k);
|
|
F_FLOAT rhojp = (c.x*p + c.y)*p + c.z;
|
|
k=(static_cast <int> (_type2z2r[itype*_cuda_ntypes+jtype])*(_nr+1)+m)*2;
|
|
c=fetchZ2r(k);
|
|
F_FLOAT z2p = (c.x*p + c.y)*p + c.z;
|
|
c=fetchZ2r(k+1);
|
|
F_FLOAT z2 = ((c.w*p + c.x)*p + c.y)*p+c.z;
|
|
|
|
F_FLOAT recip = F_F(1.0)/r;
|
|
F_FLOAT phi = z2*recip;
|
|
F_FLOAT phip = z2p*recip - phi*recip;
|
|
F_FLOAT psip = _fp[i]*rhojp + _fp[j]*rhoip + phip;
|
|
fpair = -psip*recip;
|
|
|
|
F_FLOAT dxfp,dyfp,dzfp;
|
|
fxtmp += dxfp = delx*fpair;
|
|
fytmp += dyfp = dely*fpair;
|
|
fztmp += dzfp = delz*fpair;
|
|
evdwl+=phi;
|
|
if(vflag||vflag_atom)
|
|
{
|
|
sharedV[0 * blockDim.x]+= delx*dxfp;
|
|
sharedV[1 * blockDim.x]+= dely*dyfp;
|
|
sharedV[2 * blockDim.x]+= delz*dzfp;
|
|
sharedV[3 * blockDim.x]+= delx*dyfp;
|
|
sharedV[4 * blockDim.x]+= delx*dzfp;
|
|
sharedV[5 * blockDim.x]+= dely*dzfp;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
__syncthreads();
|
|
if(ii < _inum)
|
|
{
|
|
F_FLOAT* my_f;
|
|
if(_collect_forces_later)
|
|
{
|
|
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
|
|
if(eflag)
|
|
{
|
|
buffer=&buffer[1 * gridDim.x * gridDim.y];
|
|
}
|
|
if(vflag)
|
|
{
|
|
buffer=&buffer[6 * gridDim.x * gridDim.y];
|
|
}
|
|
my_f = (F_FLOAT*) buffer;
|
|
my_f += i;
|
|
*my_f = fxtmp; my_f += _nmax;
|
|
*my_f = fytmp; my_f += _nmax;
|
|
*my_f = fztmp;
|
|
}
|
|
else
|
|
{
|
|
my_f = _f + i;
|
|
*my_f += fxtmp; my_f += _nmax;
|
|
*my_f += fytmp; my_f += _nmax;
|
|
*my_f += fztmp;
|
|
}
|
|
}
|
|
__syncthreads();
|
|
|
|
if(eflag)
|
|
{
|
|
sharedE[0] = evdwl;
|
|
}
|
|
if(eflag_atom && i<_nlocal)
|
|
{
|
|
_eatom[i] += evdwl;
|
|
}
|
|
|
|
if(vflag_atom && i<_nlocal)
|
|
{
|
|
_vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
|
|
_vatom[i+_nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
|
|
_vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
|
|
_vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
|
|
_vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
|
|
_vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
|
|
}
|
|
if(vflag||eflag) PairVirialCompute_A_Kernel(eflag,vflag,0);
|
|
}
|
|
|
|
__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,F_FLOAT* buffer)
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
int* list=sendlist+iswap*maxlistlength;
|
|
if(i<n)
|
|
{
|
|
int j=list[i];
|
|
buffer[i]=_fp[j];
|
|
}
|
|
}
|
|
|
|
__global__ void PairEAMCuda_UnpackComm_Kernel(int n,int first,F_FLOAT* buffer)
|
|
{
|
|
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
|
|
if(i<n)
|
|
{
|
|
_fp[i+first]=buffer[i];
|
|
}
|
|
}
|