mirror of https://github.com/lammps/lammps.git
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7141 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent 1049b963d7
commit dcbe4d90ba
@@ -0,0 +1,140 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>

#include "pair_sw_cuda_cu.h"
__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
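
// Added commentary (not in the original source): the flattened params_sw array
// appears to hold one ParamSW_Float entry per (i,j,k) element triplet, indexed
// through the elem2param table that Cuda_PairSWCuda_Init() below copies into
// constant memory alongside the type-to-element map.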

#include "pair_sw_cuda_kernel_nc.cu"

#include <time.h>

void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
{
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
  X_FLOAT box_size[3] =
  {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };

  cudaMemcpyToSymbol(MY_CONST(box_size)            , box_size                          , sizeof(X_FLOAT)*3);
  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes)         , &cuda_ntypes                      , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(virial)              , &sdata->pair.virial.dev_data      , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(eng_vdwl)            , &sdata->pair.eng_vdwl.dev_data    , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(periodicity)         , sdata->domain.periodicity         , sizeof(int)*3);
  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
  cudaMemcpyToSymbol("params_sw"                   , params_host                       , sizeof(ParamSW_Float)*nelements_h*nelements_h*nelements_h);
  cudaMemcpyToSymbol("elem2param"                  , elem2param_host                   , sizeof(int)*nelements_h*nelements_h*nelements_h);
  cudaMemcpyToSymbol("map"                         , map_host                          , sizeof(int)*cuda_ntypes);
  cudaMemcpyToSymbol("nelements"                   , &nelements_h                      , sizeof(int));
}

void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static int glob_ij_size = 0;
  static F_FLOAT4* glob_r_ij = NULL;
  static int* glob_numneigh_red = NULL;
  static int* glob_neighbors_red = NULL;
  static int* glob_neightype_red = NULL;

  if(glob_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
  {
    glob_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
    cudaMalloc(&glob_r_ij, glob_ij_size*4);
    cudaMalloc(&glob_numneigh_red, sdata->atom.nall*sizeof(int));
    cudaMalloc(&glob_neighbors_red, sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
    cudaMalloc(&glob_neightype_red, sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
    cudaMemcpyToSymbol("_glob_numneigh_red" , &glob_numneigh_red , sizeof(int*));
    cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red, sizeof(int*));
    cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red, sizeof(int*));
    cudaMemcpyToSymbol("_glob_r_ij"         , &glob_r_ij         , sizeof(F_FLOAT4*));
  }

  dim3 grid, threads;
  int sharedperproc;

  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();

  dim3 grid2;
  if(sdata->atom.nall <= 256*64000) {
    grid2.x = (sdata->atom.nall+255)/256;
    grid2.y = 1;
  } else {
    grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
    grid2.y = 128;
  }
  grid2.z = 1;
  dim3 threads2;
  threads2.x = 256;
  threads2.y = 1;
  threads2.z = 1;
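
  // Added commentary (not in the original source): grid2/threads2 size the
  // precompute kernel launched below with one thread per atom (all nall
  // local+ghost atoms, 256 threads per block); for very large systems the
  // blocks are presumably spread over a 2D grid to stay below CUDA's
  // per-dimension limit on the number of blocks.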

  timespec time1, time2;

  //pre-calculate all neighbor distances (reduced neighbor list)
  clock_gettime(CLOCK_REALTIME, &time1);
  Pair_SW_Kernel_TpA_RIJ<<<grid2, threads2, 0, streams[1]>>>();
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test1 +=
    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
  clock_gettime(CLOCK_REALTIME, &time1);

  //actual force calculation
  unsigned int sharedsize = (sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure
  if(eflag)
  {
    if(vflag)
      Pair_SW_Kernel_TpA<1,1><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
    else
      Pair_SW_Kernel_TpA<1,0><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
  }
  else
  {
    if(vflag)
      Pair_SW_Kernel_TpA<0,1><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
    else
      Pair_SW_Kernel_TpA<0,0><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
  }
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test2 +=
    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
@@ -0,0 +1,39 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

struct ParamSW_Float {
  F_FLOAT epsilon, sigma;
  F_FLOAT littlea, lambda, gamma, costheta;
  F_FLOAT biga, bigb;
  F_FLOAT powerp, powerq;
  F_FLOAT tol;
  F_FLOAT cut, cutsq;
  F_FLOAT sigma_gamma, lambda_epsilon, lambda_epsilon2;
  F_FLOAT c1, c2, c3, c4, c5, c6;
  int ielement, jelement, kelement;
};

extern "C" void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h);
extern "C" void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
@@ -0,0 +1,449 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#define Pi  F_F(3.1415926535897932384626433832795)
#define PI  Pi
#define PI2 F_F(0.5)*Pi
#define PI4 F_F(0.25)*Pi
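
// Added commentary (not in the original source): twobody() evaluates the
// Stillinger-Weber pair term
//   phi_2(r) = A*eps * ( B*(sigma/r)^p - (sigma/r)^q ) * exp( sigma/(r - a*sigma) )
// and its radial derivative divided by r. The c1..c6 members of ParamSW_Float
// appear to be the same precomputed combinations of A, B, eps, sigma, p and q
// used by the CPU pair_sw style, with params_sw[iparam].cut = a*sigma.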

__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce,
                        int eflag, ENERGY_FLOAT &eng)
{
  F_FLOAT r, rp, rq, rainv, expsrainv;

  r = sqrt(rsq);
  rp = pow(r, -params_sw[iparam].powerp);
  rq = pow(r, -params_sw[iparam].powerq);
  rainv = 1.0 / (r - params_sw[iparam].cut);
  expsrainv = exp(params_sw[iparam].sigma * rainv);
  fforce = (params_sw[iparam].c1*rp - params_sw[iparam].c2*rq +
            (params_sw[iparam].c3*rp - params_sw[iparam].c4*rq) * rainv*rainv*r) * expsrainv / rsq;
  if (eflag) eng += (params_sw[iparam].c5*rp - params_sw[iparam].c6*rq) * expsrainv;
}
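
// Added commentary (not in the original source): threebody() evaluates the
// Stillinger-Weber three-body term
//   phi_3 = lambda*eps * (cos(theta_jik) - cos(theta_0))^2
//           * exp( gamma*sigma/(r_ij - a*sigma) ) * exp( gamma*sigma/(r_ik - a*sigma) )
// for a triple (i,j,k) and returns the forces on the two outer atoms j and k;
// the central atom i receives minus their sum. delr1/delr2 carry the
// displacement components in .x/.y/.z and the squared distance in .w.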

__device__ void threebody(int paramij, int paramik, int paramijk,
                          F_FLOAT4& delr1,
                          F_FLOAT4& delr2,
                          F_FLOAT3& fj, F_FLOAT3& fk, int eflag, ENERGY_FLOAT &eng)
{
  F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1;
  F_FLOAT r2, rinvsq2, rainv2, gsrainv2, gsrainvsq2, expgsrainv2;
  F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1, frad2;
  F_FLOAT facang, facang12, csfacang, csfac1, csfac2;

  r1 = sqrt(delr1.w);
  rinvsq1 = F_F(1.0)/delr1.w;
  rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut);
  gsrainv1 = params_sw[paramij].sigma_gamma * rainv1;
  gsrainvsq1 = gsrainv1*rainv1/r1;
  expgsrainv1 = exp(gsrainv1);

  r2 = sqrt(delr2.w);
  rinvsq2 = F_F(1.0)/delr2.w;
  rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut);
  gsrainv2 = params_sw[paramik].sigma_gamma * rainv2;
  gsrainvsq2 = gsrainv2*rainv2/r2;
  expgsrainv2 = exp(gsrainv2);

  rinv12 = F_F(1.0)/(r1*r2);
  cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12;
  delcs = cs - params_sw[paramijk].costheta;
  delcssq = delcs*delcs;

  facexp = expgsrainv1*expgsrainv2;

  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
  //          facexp*delcssq;

  facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq;
  frad1 = facrad*gsrainvsq1;
  frad2 = facrad*gsrainvsq2;
  facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs;
  facang12 = rinv12*facang;
  csfacang = cs*facang;
  csfac1 = rinvsq1*csfacang;

  fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12;
  fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12;
  fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12;

  csfac2 = rinvsq2*csfacang;

  fk.x = delr2.x*(frad2+csfac2)-delr1.x*facang12;
  fk.y = delr2.y*(frad2+csfac2)-delr1.y*facang12;
  fk.z = delr2.z*(frad2+csfac2)-delr1.z*facang12;

  if (eflag) eng += F_F(2.0)*facrad;
}
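
// Added commentary (not in the original source): threebody_fj() is the same
// three-body evaluation reduced to the force on the first outer atom only.
// It is used below for triples centered on a neighbor j, where the thread
// owning atom i only needs the contribution that acts on i itself.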

__device__ void threebody_fj(int paramij, int paramik, int paramijk,
                             F_FLOAT4& delr1,
                             F_FLOAT4& delr2,
                             F_FLOAT3& fj)
{
  F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1;
  F_FLOAT r2, rainv2, gsrainv2, expgsrainv2;
  F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1;
  F_FLOAT facang, facang12, csfacang, csfac1;

  r1 = sqrt(delr1.w);
  rinvsq1 = F_F(1.0)/delr1.w;
  rainv1 = F_F(1.0)/(r1 - params_sw[paramij].cut);
  gsrainv1 = params_sw[paramij].sigma_gamma * rainv1;
  gsrainvsq1 = gsrainv1*rainv1/r1;
  expgsrainv1 = exp(gsrainv1);

  r2 = sqrt(delr2.w);
  rainv2 = F_F(1.0)/(r2 - params_sw[paramik].cut);
  gsrainv2 = params_sw[paramik].sigma_gamma * rainv2;
  expgsrainv2 = exp(gsrainv2);

  rinv12 = F_F(1.0)/(r1*r2);
  cs = (delr1.x*delr2.x + delr1.y*delr2.y + delr1.z*delr2.z) * rinv12;
  delcs = cs - params_sw[paramijk].costheta;
  delcssq = delcs*delcs;

  facexp = expgsrainv1*expgsrainv2;

  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
  //          facexp*delcssq;

  facrad = params_sw[paramijk].lambda_epsilon * facexp*delcssq;
  frad1 = facrad*gsrainvsq1;
  facang = params_sw[paramijk].lambda_epsilon2 * facexp*delcs;
  facang12 = rinv12*facang;
  csfacang = cs*facang;
  csfac1 = rinvsq1*csfacang;

  fj.x = delr1.x*(frad1+csfac1)-delr2.x*facang12;
  fj.y = delr1.y*(frad1+csfac1)-delr2.y*facang12;
  fj.z = delr1.z*(frad1+csfac1)-delr2.z*facang12;
}
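
// Added commentary (not in the original source): this kernel builds a reduced
// neighbor list per atom, keeping only neighbors inside the SW force cutoff,
// and caches the corresponding r_ij vectors (with r^2 in .w) in global memory
// so the force kernel does not have to recompute distances.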

__global__ void Pair_SW_Kernel_TpA_RIJ() //F_FLOAT4* _glob_r_ij, int* _glob_numneigh_red, int* _glob_neighbors_red, int* _glob_neightype_red
{
  int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
  if(ii >= _nall) return;

  X_FLOAT4 myxtype;
  F_FLOAT4 delij;
  F_FLOAT xtmp, ytmp, ztmp;
  int itype, jnum, i, j;
  int* jlist;
  int neigh_red = 0;
  i = ii; //_ilist[ii];
  myxtype = fetchXType(i);

  xtmp = myxtype.x;
  ytmp = myxtype.y;
  ztmp = myxtype.z;
  itype = map[(static_cast <int> (myxtype.w))];

  jnum = _numneigh[i];
  jlist = &_neighbors[i];

  __syncthreads();
  for (int jj = 0; jj < jnum; jj++)
  {
    if(jj < jnum)
    {
      j = jlist[jj*_nall];
      j &= NEIGHMASK;
      myxtype = fetchXType(j);
      delij.x = xtmp - myxtype.x;
      delij.y = ytmp - myxtype.y;
      delij.z = ztmp - myxtype.z;
      int jtype = map[(static_cast <int> (myxtype.w))];
      int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype];
      delij.w = vec3_dot(delij, delij);
      if (delij.w < params_sw[iparam_ij].cutsq)
      {
        _glob_neighbors_red[i+neigh_red*_nall] = j;
        _glob_neightype_red[i+neigh_red*_nall] = jtype;
        _glob_r_ij[i+neigh_red*_nall] = delij;
        neigh_red++;
      }
    }
  }
  _glob_numneigh_red[i] = neigh_red;
}
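
// Added commentary (not in the original source): the main force kernel uses
// one thread per atom ("TpA"). The per-thread force accumulators live in
// shared memory (the fxtmp/fytmp/fztmp macros below) to lower register
// pressure, and energy/virial contributions are accumulated in shared memory
// for the block-wide reduction done by PairVirialCompute_A_Kernel_Template().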

template <int eflag, int vflagm>
__global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red
{
  ENERGY_FLOAT evdwl = ENERGY_F(0.0);

  ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x];
  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];

  F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem;
  if((eflag||eflag_atom)&&(vflagm||vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7*blockDim.x];
  else
    if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x];
    else
      if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6*blockDim.x];
  shared_F_F += threadIdx.x;

  if(eflag_atom||eflag)
  {
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;
  }

  if(vflagm||vflag_atom)
  {
    sharedV[0*blockDim.x] = ENERGY_F(0.0);
    sharedV[1*blockDim.x] = ENERGY_F(0.0);
    sharedV[2*blockDim.x] = ENERGY_F(0.0);
    sharedV[3*blockDim.x] = ENERGY_F(0.0);
    sharedV[4*blockDim.x] = ENERGY_F(0.0);
    sharedV[5*blockDim.x] = ENERGY_F(0.0);
  }

  int jnum_red = 0;
#define fxtmp shared_F_F[0]
#define fytmp shared_F_F[blockDim.x]
#define fztmp shared_F_F[2*blockDim.x]
  //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x]))

  int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
  X_FLOAT4 myxtype_i, myxtype_j, myxtype_k;
  F_FLOAT4 delij, delik, deljk;
  F_FLOAT fpair;

  int itype, i, j;
  int* jlist_red;

  if(ii < _inum)
  {
    i = _ilist[ii];

    if(vflagm)
      myxtype_i = fetchXType(i);
    //itype=map[(static_cast <int> (myxtype_i.w))];
    itype = map[_type[i]];

    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i];
    jnum_red = _glob_numneigh_red[i];
    jlist_red = &_glob_neighbors_red[i];
  }
  __syncthreads();
#pragma unroll 1
  for (int jj = 0; jj < jnum_red; jj++)
  {
    if(i < _nlocal)
    {
      fpair = F_F(0.0);
      j = jlist_red[jj*_nall];
      j &= NEIGHMASK;

      if(vflagm)
        myxtype_j = fetchXType(j);

      int jtype = _glob_neightype_red[i+jj*_nall];
      delij = _glob_r_ij[i+jj*_nall];

      volatile int iparam_ij = elem2param[(itype*nelements+jtype)*nelements+jtype];
      volatile int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype];

      if (delij.w < params_sw[iparam_ij].cutsq)
      {
        F_FLOAT dxfp, dyfp, dzfp;
        twobody(iparam_ij, delij.w, fpair, eflag, evdwl);
        fxtmp += dxfp = delij.x*fpair;
        fytmp += dyfp = delij.y*fpair;
        fztmp += dzfp = delij.z*fpair;
        if(vflagm)
        {
          sharedV[0 * blockDim.x] += delij.x*dxfp;
          sharedV[1 * blockDim.x] += delij.y*dyfp;
          sharedV[2 * blockDim.x] += delij.z*dzfp;
          sharedV[3 * blockDim.x] += delij.x*dyfp;
          sharedV[4 * blockDim.x] += delij.x*dzfp;
          sharedV[5 * blockDim.x] += delij.y*dzfp;
        }

        vec3_scale(F_F(-1.0), delij, delij);

#pragma unroll 1
        for (int kk = jj+1; kk < jnum_red; kk++) {
          int k = jlist_red[kk*_nall];
          k &= NEIGHMASK;

          if(vflagm)
            myxtype_k = fetchXType(k);

          delik = _glob_r_ij[i+kk*_nall];

          int ktype = _glob_neightype_red[i+kk*_nall];
          int iparam_ik = elem2param[(itype*nelements+ktype)*nelements+ktype];
          int iparam_ijk = elem2param[(itype*nelements+jtype)*nelements+ktype];
          vec3_scale(F_F(-1.0), delik, delik);
          if (delik.w <= params_sw[iparam_ijk].cutsq)
          {
            F_FLOAT3 fj, fk;
            threebody(iparam_ij, iparam_ik, iparam_ijk,
                      delij, delik, fj, fk, eflag, evdwl);
            fxtmp -= fj.x + fk.x;
            fytmp -= fj.y + fk.y;
            fztmp -= fj.z + fk.z;

            if(vflagm)
            {
              sharedV[0 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.x*(fj.x+fk.x);
              sharedV[1 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.y*(fj.y+fk.y);
              sharedV[2 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.z*(fj.z+fk.z);
              sharedV[3 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.x*(fj.y+fk.y);
              sharedV[4 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.x*(fj.z+fk.z);
              sharedV[5 * blockDim.x] -= ENERGY_F(2.0)*myxtype_i.y*(fj.z+fk.z);

              sharedV[0 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.x*fj.x;
              sharedV[1 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.y*fj.y;
              sharedV[2 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.z*fj.z;
              sharedV[3 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.x*fj.y;
              sharedV[4 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.x*fj.z;
              sharedV[5 * blockDim.x] += ENERGY_F(2.0)*myxtype_j.y*fj.z;

              sharedV[0 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.x*fk.x;
              sharedV[1 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.y*fk.y;
              sharedV[2 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.z*fk.z;
              sharedV[3 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.x*fk.y;
              sharedV[4 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.x*fk.z;
              sharedV[5 * blockDim.x] += ENERGY_F(2.0)*myxtype_k.y*fk.z;
            }
          }
        }

        int j_jnum_red = _glob_numneigh_red[j];
        int* j_jlist_red = &_glob_neighbors_red[j];

        int j_ii = 0;
        //#pragma unroll 1
        for (int j_kk = 0; j_kk < j_jnum_red; j_kk++) {
          if(j_jlist_red[j_kk*_nall] == i) j_ii = j_kk;
        }

#pragma unroll 1
        for (int kk = 0; kk < j_jnum_red; kk++)
        {
          if (j_ii == kk) continue;
          int k = j_jlist_red[kk*_nall];
          k &= NEIGHMASK;
          deljk = _glob_r_ij[j+kk*_nall];
          vec3_scale(F_F(-1.0), deljk, deljk);
          int ktype = _glob_neightype_red[j+kk*_nall];

          int iparam_ji = elem2param[(jtype*nelements+itype)*nelements+itype];
          int iparam_jk = elem2param[(jtype*nelements+ktype)*nelements+ktype];
          int iparam_jik = elem2param[(jtype*nelements+itype)*nelements+ktype];

          vec3_scale(F_F(-1.0), delij, delij);
          if (deljk.w <= params_sw[iparam_jik].cutsq)
          {
            F_FLOAT3 fj;

            threebody_fj(iparam_ji, iparam_jk, iparam_jik,
                         delij, deljk, fj);
            fxtmp += fj.x;
            fytmp += fj.y;
            fztmp += fj.z;
          }
          vec3_scale(F_F(-1.0), delij, delij);
        }
      }
    }
    __syncthreads();
  }
  __syncthreads();
  if(ii < _inum)
  {
    F_FLOAT* my_f;
    if(_collect_forces_later)
    {
      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
      if(eflag)
      {
        buffer = &buffer[1 * gridDim.x * gridDim.y];
      }
      if(vflagm)
      {
        buffer = &buffer[6 * gridDim.x * gridDim.y];
      }
      my_f = (F_FLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp; my_f += _nmax;
      *my_f = fytmp; my_f += _nmax;
      *my_f = fztmp;
    }
    else
    {
      my_f = _f + i;
      *my_f += fxtmp; my_f += _nmax;
      *my_f += fytmp; my_f += _nmax;
      *my_f += fztmp;
    }
  }
  __syncthreads();

  if(eflag)
  {
    sharedE[0] = evdwl;
  }
  if(eflag_atom && i < _nlocal)
  {
    _eatom[i] += evdwl;
  }

  if(vflag_atom && i < _nlocal)
  {
    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i+_nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i+2*_nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i+3*_nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i+4*_nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i+5*_nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }
  if(vflagm||eflag) PairVirialCompute_A_Kernel_Template<1,1>();
#undef fxtmp
#undef fytmp
#undef fztmp
  //#undef jnum_red
}
@@ -0,0 +1,155 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include <stdio.h>

#include "pair_tersoff_cuda_cu.h"
__device__ __constant__ Param_Float params[MANYBODY_NPAIR*MANYBODY_NPAIR*MANYBODY_NPAIR];
__device__ __constant__ F_FLOAT* _glob_zeta_ij;  //zeta_ij
__device__ __constant__ F_FLOAT4* _glob_r_ij;    //r_ij (x,y,z,r^2) for pairs within force cutoff
__device__ __constant__ bool _zbl;               //is tersoff zbl?
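
// Added commentary (not in the original source): compared to the SW variant,
// the Tersoff path additionally caches the bond-order sum zeta_ij for every
// reduced-list pair (filled by a separate kernel before the force kernel runs),
// and _zbl appears to select the tersoff/zbl flavour with its ZBL core repulsion.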

#include "pair_tersoff_cuda_kernel_nc.cu"

#include <time.h>

void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl)
{
  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
  X_FLOAT box_size[3] =
  {
    sdata->domain.subhi[0] - sdata->domain.sublo[0],
    sdata->domain.subhi[1] - sdata->domain.sublo[1],
    sdata->domain.subhi[2] - sdata->domain.sublo[2]
  };

  cudaMemcpyToSymbol(MY_CONST(box_size)            , box_size                          , sizeof(X_FLOAT)*3);
  cudaMemcpyToSymbol(MY_CONST(cuda_ntypes)         , &cuda_ntypes                      , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_CONST(virial)              , &sdata->pair.virial.dev_data      , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(eng_vdwl)            , &sdata->pair.eng_vdwl.dev_data    , sizeof(ENERGY_FLOAT*));
  cudaMemcpyToSymbol(MY_CONST(periodicity)         , sdata->domain.periodicity         , sizeof(int)*3);
  cudaMemcpyToSymbol(MY_CONST(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
  cudaMemcpyToSymbol("params"                      , params_host                       , sizeof(Param_Float)*nelements_h*nelements_h*nelements_h);
  cudaMemcpyToSymbol("elem2param"                  , elem2param_host                   , sizeof(int)*nelements_h*nelements_h*nelements_h);
  cudaMemcpyToSymbol("map"                         , map_host                          , sizeof(int)*cuda_ntypes);
  cudaMemcpyToSymbol("nelements"                   , &nelements_h                      , sizeof(int));
  cudaMemcpyToSymbol("_zbl"                        , &zbl                              , sizeof(bool));
}

void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static F_FLOAT* glob_zeta_ij = NULL;
  static int glob_zeta_ij_size = 0;
  static F_FLOAT4* glob_r_ij = NULL;
  static int* glob_numneigh_red = NULL;
  static int* glob_neighbors_red = NULL;
  static int* glob_neightype_red = NULL;

  if(glob_zeta_ij_size < sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT))
  {
    glob_zeta_ij_size = sdata->atom.nall*sneighlist->maxneighbors*sizeof(F_FLOAT);
    cudaFree(glob_zeta_ij);
    cudaFree(glob_r_ij);
    cudaFree(glob_numneigh_red);
    cudaFree(glob_neighbors_red);
    cudaFree(glob_neightype_red);
    cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size);
    cudaMalloc(&glob_r_ij, glob_zeta_ij_size*4);
    cudaMalloc(&glob_numneigh_red, sdata->atom.nall*sizeof(int));
    cudaMalloc(&glob_neighbors_red, sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
    cudaMalloc(&glob_neightype_red, sdata->atom.nall*sneighlist->maxneighbors*sizeof(int));
    cudaMemcpyToSymbol("_glob_numneigh_red" , &glob_numneigh_red , sizeof(int*));
    cudaMemcpyToSymbol("_glob_neighbors_red", &glob_neighbors_red, sizeof(int*));
    cudaMemcpyToSymbol("_glob_neightype_red", &glob_neightype_red, sizeof(int*));
    cudaMemcpyToSymbol("_glob_r_ij"         , &glob_r_ij         , sizeof(F_FLOAT4*));
    cudaMemcpyToSymbol("_glob_zeta_ij"      , &glob_zeta_ij      , sizeof(F_FLOAT*));
  }

  dim3 grid, threads;
  int sharedperproc;

  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();

  dim3 grid2;
  if(sdata->atom.nall <= 256*64000) {
    grid2.x = (sdata->atom.nall+255)/256;
    grid2.y = 1;
  } else {
    grid2.x = (sdata->atom.nall+256*128-1)/(256*128);
    grid2.y = 128;
  }
  grid2.z = 1;
  dim3 threads2;
  threads2.x = 256;
  threads2.y = 1;
  threads2.z = 1;

  timespec time1, time2;
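
  // Added commentary (not in the original source): the Tersoff force
  // evaluation is split into two precompute passes, first the reduced
  // neighbor list with r_ij, then zeta_ij for every pair, because the bond
  // order b_ij depends on a sum over all third atoms k and must be complete
  // before the force kernel below uses it.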

  //pre-calculate all neighbor distances and zeta_ij
  clock_gettime(CLOCK_REALTIME, &time1);
  Pair_Tersoff_Kernel_TpA_RIJ<<<grid2, threads2, 0, streams[1]>>>();
  cudaThreadSynchronize();
  Pair_Tersoff_Kernel_TpA_ZetaIJ<<<grid2, threads2, 0, streams[1]>>>();
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test1 +=
    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
  clock_gettime(CLOCK_REALTIME, &time1);

  //actual force calculation
  unsigned int sharedsize = (sharedperproc*sizeof(ENERGY_FLOAT)+4*sizeof(F_FLOAT))*threads.x; //extra 4 floats per thread used to reduce register pressure
  if(eflag)
  {
    if(vflag)
      Pair_Tersoff_Kernel_TpA<1,1><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
    else
      Pair_Tersoff_Kernel_TpA<1,0><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
  }
  else
  {
    if(vflag)
      Pair_Tersoff_Kernel_TpA<0,1><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
    else
      Pair_Tersoff_Kernel_TpA<0,0><<<grid, threads, sharedsize, streams[1]>>>
        (eflag_atom, vflag_atom);
  }
  cudaThreadSynchronize();
  clock_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.test2 +=
    time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
@@ -0,0 +1,42 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

#include "cuda_shared.h"

struct Param_Float {
  F_FLOAT lam1, lam2, lam3;
  F_FLOAT c, d, h;
  F_FLOAT gamma, powerm;
  F_FLOAT powern, beta;
  F_FLOAT biga, bigb, bigd, bigr;
  F_FLOAT cut, cutsq;
  F_FLOAT c1, c2, c3, c4;
  int ielement, jelement, kelement;
  int powermint;
  //F_FLOAT Z_i,Z_j;
  F_FLOAT ZBLcut, ZBLexpscale;
  F_FLOAT a_ij, premult;
};

extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl);
extern "C" void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
File diff suppressed because it is too large.