/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
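
/* ----------------------------------------------------------------------
   pack positions of the atoms in send list iswap into buffer, adding the
   shift (dx,dy,dz) that accounts for periodic boundaries.  Positions are
   stored structure-of-arrays style: x, y, z of atom j live at _x[j],
   _x[j+_nmax], _x[j+2*_nmax].  _x, _v, _f, _nmax, _flag and _buffer are
   device-scope globals defined elsewhere in the USER-CUDA package.
------------------------------------------------------------------------- */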
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    if (j >= _nmax) _flag[0] = 1;   // valid indices are 0 .. _nmax-1; raise the error flag otherwise

    ((X_FLOAT*) buffer)[i]         = _x[j]             + dx;
    ((X_FLOAT*) buffer)[i + n]     = _x[j + _nmax]     + dy;
    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
  }
}
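
/* ----------------------------------------------------------------------
   same as Cuda_CommCuda_PackComm_Kernel, but also packs the velocities
   behind the positions in the buffer
------------------------------------------------------------------------- */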
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    if (j >= _nmax) _flag[0] = 1;   // valid indices are 0 .. _nmax-1; raise the error flag otherwise

    ((X_FLOAT*) buffer)[i]         = _x[j]             + dx;
    ((X_FLOAT*) buffer)[i + n]     = _x[j + _nmax]     + dy;
    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
    ((X_FLOAT*) buffer)[i + 3 * n] = _v[j];
    ((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
    ((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
  }
}
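
/* ----------------------------------------------------------------------
   pack for a swap with self: copy the shifted positions directly into
   the atom slots starting at index first, without an intermediate buffer
------------------------------------------------------------------------- */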
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    _x[i + first]             = _x[j]             + dx;
    _x[i + first + _nmax]     = _x[j + _nmax]     + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
  }
}
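
/* ----------------------------------------------------------------------
   self-swap variant that also copies the velocities
------------------------------------------------------------------------- */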
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    _x[i + first]             = _x[j]             + dx;
    _x[i + first + _nmax]     = _x[j + _nmax]     + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    _v[i + first]             = _v[j];
    _v[i + first + _nmax]     = _v[j + _nmax];
    _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
  }
}
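
/* ----------------------------------------------------------------------
   unpack n received positions from buffer into the atom slots starting
   at index first
------------------------------------------------------------------------- */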
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (i < n) {
    _x[i + first]             = ((X_FLOAT*) buffer)[i];
    _x[i + first + _nmax]     = ((X_FLOAT*) buffer)[i + n];
    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
  }
}
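
/* ----------------------------------------------------------------------
   as above, but also unpacks the velocities
------------------------------------------------------------------------- */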
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (i < n) {
    _x[i + first]             = ((X_FLOAT*) buffer)[i];
    _x[i + first + _nmax]     = ((X_FLOAT*) buffer)[i + n];
    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
    _v[i + first]             = ((X_FLOAT*) buffer)[i + 3 * n];
    _v[i + first + _nmax]     = ((X_FLOAT*) buffer)[i + 4 * n];
    _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n];
  }
}
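
/* ----------------------------------------------------------------------
   reverse communication: pack the forces of the n atoms starting at
   index first into the device buffer _buffer
------------------------------------------------------------------------- */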
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (i < n) {
    ((F_FLOAT*) _buffer)[i]         = _f[i + first];
    ((F_FLOAT*) _buffer)[i + n]     = _f[i + first + _nmax];
    ((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
  }
}
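
/* ----------------------------------------------------------------------
   reverse communication: accumulate the received forces onto the atoms
   listed in send list iswap
------------------------------------------------------------------------- */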
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int maxlistlength, int iswap)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    _f[j]             += ((F_FLOAT*) _buffer)[i];
    _f[j + _nmax]     += ((F_FLOAT*) _buffer)[i + n];
    _f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n];
  }
}
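
/* ----------------------------------------------------------------------
   self-swap variant of the reverse unpack: accumulate the forces stored
   at slots [first, first+n) directly onto the atoms listed in send list
   iswap
------------------------------------------------------------------------- */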
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if (i < n) {
    int j = list[i];

    _f[j]             += _f[i + first];
    _f[j + _nmax]     += _f[i + first + _nmax];
    _f[j + 2 * _nmax] += _f[i + first + 2 * _nmax];
  }
}

extern __shared__ int shared[];
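
/* ----------------------------------------------------------------------
   build send list iswap for the single-cutoff case: an atom is sent if
   its coordinate along dim lies inside the slab [slablo[iswap],
   slabhi[iswap]].  Per block, every thread flags its atom in shared[];
   thread 0 turns the flags into 1-based ranks by a serial scan and
   reserves space in the global list via atomicAdd on the counter at
   _buffer; each flagged thread then writes its atom index into its
   reserved slot.  The kernel needs (blockDim.x + 1) ints of dynamic
   shared memory, so a launch would look roughly like this (hypothetical
   host-side sketch, not taken from this file):

     Cuda_CommCuda_BuildSendlist_Single
       <<<grid, threads, (threads.x + 1) * sizeof(int)>>>(...);

   Entries beyond maxlistlength are silently dropped; the final count in
   _buffer is presumably checked against maxlistlength on the host.
------------------------------------------------------------------------- */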
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
                                                   int nfirst, int nlast, int dim, int iswap,
                                                   X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength)
{
  int* list = sendlist + iswap * maxlistlength;
  X_FLOAT lo = slablo[iswap];
  X_FLOAT hi = slabhi[iswap];
  bool add = false;

  if (!bordergroup || ineed >= 2) {
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;

    if (i < nlast)
      if (_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi)
        add = true;

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if (threadIdx.x == 0) {
      // serial scan: shared[k] becomes the 1-based rank of atom k among
      // the block's accepted atoms
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      // reserve nsend slots in the global send list; shared[blockDim.x]
      // holds this block's starting offset
      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;

  } else {

    // first pass: atoms of the border group (indices 0 .. atom_nfirst-1)
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

    if (i < atom_nfirst)
      if (_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi)
        add = true;

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if (threadIdx.x == 0) {
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;

    __syncthreads();

    // second pass: atoms from index _nlocal up to nlast
    add = false;
    i += _nlocal;

    if (i < nlast)
      if (_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi)
        add = true;

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    nsend = 0;

    if (threadIdx.x == 0) {
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;
  }
}
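
/* ----------------------------------------------------------------------
   same counting scheme as Cuda_CommCuda_BuildSendlist_Single, but with
   per-type slab bounds: an atom of type t is tested against
   multilo[iswap*_cuda_ntypes + t] and multihi[iswap*_cuda_ntypes + t]
------------------------------------------------------------------------- */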
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst,
                                                  int nfirst, int nlast, int dim, int iswap,
                                                  X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength)
{
  int* list = sendlist + iswap * maxlistlength;
  X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes];
  X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes];
  int itype = 0;
  bool add = false;

  if (!bordergroup || ineed >= 2) {
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;

    if (i < nlast) {
      itype = _type[i];

      if (_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype])
        add = true;
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if (threadIdx.x == 0) {
      // serial scan: shared[k] becomes the 1-based rank of atom k among
      // the block's accepted atoms
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      // reserve nsend slots in the global send list
      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;

  } else {

    // first pass: atoms of the border group (indices 0 .. atom_nfirst-1)
    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

    if (i < atom_nfirst) {
      itype = _type[i];

      if (_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype])
        add = true;
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    int nsend = 0;

    if (threadIdx.x == 0) {
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;

    __syncthreads();

    // second pass: atoms from index _nlocal up to nlast
    add = false;
    i += _nlocal;

    if (i < nlast) {
      itype = _type[i];

      if (_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype])
        add = true;
    }

    shared[threadIdx.x] = add ? 1 : 0;

    __syncthreads();

    nsend = 0;

    if (threadIdx.x == 0) {
      for (int k = 0; k < blockDim.x; k++) {
        if (shared[k]) { nsend++; shared[k] = nsend; }
      }

      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
    }

    __syncthreads();

    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

    if (add && nsend < maxlistlength)
      list[nsend] = i;
  }
}