diff --git a/src/USER-CUDA/comm_cuda.cu b/src/USER-CUDA/comm_cuda.cu deleted file mode 100644 index 0233f3ee13..0000000000 --- a/src/USER-CUDA/comm_cuda.cu +++ /dev/null @@ -1,483 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include -#define MY_PREFIX comm_cuda -#include "cuda_shared.h" -#include "cuda_common.h" - -#include "crm_cuda_utils.cu" - -#include "comm_cuda_cu.h" -#include "comm_cuda_kernel.cu" -#include - -void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n) -{ - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffersizebuffer,sdata->buffersize);) - CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize); - sdata->buffer = CudaWrapper_AllocCudaData(size); - sdata->buffersize=size; - sdata->buffer_new++; - MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);) - } - cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) ); -} - - -void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) -{ - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); -} - - -void Cuda_CommCuda_Init(cuda_shared_data* sdata) -{ - Cuda_CommCuda_UpdateNmax(sdata); - int ntypesp=sdata->atom.ntypes+1; - cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int)); - cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*)); -} - -int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - cudaMemset( sdata->flag,0,sizeof(int)); - -clock_gettime(CLOCK_REALTIME,&time1); - - void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; - Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n - ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); - if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - int aflag; - cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - if(aflag!=0) printf("aflag PackComm: %i\n",aflag); - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); - - } - return 3*n; -} - -int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag) -{ - - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*6*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - cudaMemset( sdata->flag,0,sizeof(int)); - -clock_gettime(CLOCK_REALTIME,&time1); - - void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer; - Cuda_CommCuda_PackComm_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n - ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_pack+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); - if(not sdata->overlap_comm) - cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_download+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - int aflag; - cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); - if(aflag!=0) printf("aflag PackComm: %i\n",aflag); - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed"); - - } - return 6*n; -} - -int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - static int count=-1; - count++; - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - -clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_self+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); - } - - return 3*n; -} - -int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag) -{ - MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) - timespec time1,time2; - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*6*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - static int count=-1; - count++; - X_FLOAT dx=0.0; - X_FLOAT dy=0.0; - X_FLOAT dz=0.0; - if (pbc_flag != 0) { - if (sdata->domain.triclinic == 0) { - dx = pbc[0]*sdata->domain.prd[0]; - dy = pbc[1]*sdata->domain.prd[1]; - dz = pbc[2]*sdata->domain.prd[2]; - } else { - dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz; - dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz; - dz = pbc[2]*sdata->domain.prd[2]; - }} - - - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - -clock_gettime(CLOCK_REALTIME,&time1); - - Cuda_CommCuda_PackComm_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_kernel_self+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); - } - - return 6*n; -} - -void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) -{ - timespec time1,time2; - - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { -clock_gettime(CLOCK_REALTIME,&time1); - if(not sdata->overlap_comm||iswap<0) - cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_upload+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; - Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_kernel_unpack+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); - - } -} - -void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap) -{ - timespec time1,time2; - - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*6*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { -clock_gettime(CLOCK_REALTIME,&time1); - - if(not sdata->overlap_comm||iswap<0) - cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice); - -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_forward_upload+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer; - Cuda_CommCuda_UnpackComm_Kernel<<>>(n,first,buf); - cudaThreadSynchronize(); - -clock_gettime(CLOCK_REALTIME,&time1); -sdata->cuda_timings.comm_forward_kernel_unpack+= - time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed"); - - } -} - -int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send) -{ - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(F_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - - F_FLOAT* buf=(F_FLOAT*)buf_send; - F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data; - f_dev+=first; - cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); - buf+=n; f_dev+=sdata->atom.nmax; - cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); - buf+=n; f_dev+=sdata->atom.nmax; - cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost); - return n*3; -} - - -void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv) -{ - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(F_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - if(sdata->atom.nlocal>0) - { - cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice); - Cuda_CommCuda_UnpackReverse_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed"); - } -} - -void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first) -{ - if(sdata->atom.update_nmax) - Cuda_CommCuda_UpdateNmax(sdata); - if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - int size=n*3*sizeof(X_FLOAT); - if(sdata->buffer_new or (size>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,n); - - int3 layout=getgrid(n); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x, layout.y, 1); - - if(sdata->atom.nlocal>0) - { - Cuda_CommCuda_UnpackReverse_Self_Kernel<<>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed"); - - } -} - - -int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap) -{ - MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) - timespec time1,time2; - Cuda_CommCuda_UpdateNmax(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - if(sdata->buffer_new or (80>sdata->buffersize)) - Cuda_CommCuda_UpdateBuffer(sdata,10); - int n; - if (!bordergroup || ineed >= 2) - n=nlast-nfirst+1; - else - { - n=atom_nfirst; - if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1; - } - int3 layout=getgrid(n,0,512,true); - dim3 threads(layout.z, 1, 1); - dim3 grid(layout.x+1, layout.y, 1); - - - cudaMemset((int*) (sdata->buffer),0,sizeof(int)); - -clock_gettime(CLOCK_REALTIME,&time1); - if(style==1) - Cuda_CommCuda_BuildSendlist_Single<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); - else - Cuda_CommCuda_BuildSendlist_Multi<<>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength); - cudaThreadSynchronize(); -clock_gettime(CLOCK_REALTIME,&time2); -sdata->cuda_timings.comm_border_kernel_buildlist+= - time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; - - CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed"); - int nsend; - cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); - return nsend; - - -} - diff --git a/src/USER-CUDA/pppm_cuda.cu b/src/USER-CUDA/pppm_cuda.cu deleted file mode 100644 index cabea885d3..0000000000 --- a/src/USER-CUDA/pppm_cuda.cu +++ /dev/null @@ -1,579 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - - Original Version: - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - See the README file in the top-level LAMMPS directory. - - ----------------------------------------------------------------------- - - USER-CUDA Package and associated modifications: - https://sourceforge.net/projects/lammpscuda/ - - Christian Trott, christian.trott@tu-ilmenau.de - Lars Winterfeld, lars.winterfeld@tu-ilmenau.de - Theoretical Physics II, University of Technology Ilmenau, Germany - - See the README file in the USER-CUDA directory. - - This software is distributed under the GNU General Public License. -------------------------------------------------------------------------- */ - -#include "cuda_precision.h" -//#define FFT_CUFFT -#define MY_PREFIX pppm -#include "cuda_shared.h" -#include "cuda_common.h" -#include "pppm_cuda_cu.h" -#include "cuda_runtime.h" -#include - -//#include "crm_cuda_utils.cu" -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#define MAX(a,b) ((a) > (b) ? (a) : (b)) - - __device__ __constant__ FFT_FLOAT* work1; - __device__ __constant__ FFT_FLOAT* work2; - __device__ __constant__ FFT_FLOAT* work3; - __device__ __constant__ PPPM_FLOAT* greensfn; - __device__ __constant__ PPPM_FLOAT* gf_b; - __device__ __constant__ PPPM_FLOAT* fkx; - __device__ __constant__ PPPM_FLOAT* fky; - __device__ __constant__ PPPM_FLOAT* fkz; - __device__ __constant__ PPPM_FLOAT* vg; - __device__ __constant__ int* part2grid; - __device__ __constant__ PPPM_FLOAT* density_brick; - __device__ __constant__ int* density_brick_int; - __device__ __constant__ PPPM_FLOAT density_intScale; - __device__ __constant__ PPPM_FLOAT* vdx_brick; - __device__ __constant__ PPPM_FLOAT* vdy_brick; - __device__ __constant__ PPPM_FLOAT* vdz_brick; - __device__ __constant__ PPPM_FLOAT* density_fft; - __device__ __constant__ ENERGY_FLOAT* energy; - __device__ __constant__ ENERGY_FLOAT* virial; - __device__ __constant__ int nxlo_in; - __device__ __constant__ int nxhi_in; - __device__ __constant__ int nxlo_out; - __device__ __constant__ int nxhi_out; - __device__ __constant__ int nylo_in; - __device__ __constant__ int nyhi_in; - __device__ __constant__ int nylo_out; - __device__ __constant__ int nyhi_out; - __device__ __constant__ int nzlo_in; - __device__ __constant__ int nzhi_in; - __device__ __constant__ int nzlo_out; - __device__ __constant__ int nzhi_out; - __device__ __constant__ int nxlo_fft; - __device__ __constant__ int nxhi_fft; - __device__ __constant__ int nylo_fft; - __device__ __constant__ int nyhi_fft; - __device__ __constant__ int nzlo_fft; - __device__ __constant__ int nzhi_fft; - __device__ __constant__ int nx_pppm; - __device__ __constant__ int ny_pppm; - __device__ __constant__ int nz_pppm; - __device__ __constant__ int slabflag; - __device__ __constant__ PPPM_FLOAT qqrd2e; - __device__ __constant__ int order; - //__device__ __constant__ float3 sublo; - __device__ __constant__ PPPM_FLOAT* rho_coeff; - __device__ __constant__ int nmax; - __device__ __constant__ int nlocal; - __device__ __constant__ PPPM_FLOAT* debugdata; - __device__ __constant__ PPPM_FLOAT delxinv; - __device__ __constant__ PPPM_FLOAT delyinv; - __device__ __constant__ PPPM_FLOAT delzinv; - __device__ __constant__ int nlower; - __device__ __constant__ int nupper; - __device__ __constant__ PPPM_FLOAT shiftone; - - -#include "pppm_cuda_kernel.cu" -#include "stdio.h" -void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial - ,void* cu_work1,void* cu_work2, void* cu_work3,void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg - ,int cu_nxlo_in,int cu_nxhi_in,int cu_nylo_in,int cu_nyhi_in,int cu_nzlo_in,int cu_nzhi_in,int cu_nxlo_out,int cu_nxhi_out,int cu_nylo_out,int cu_nyhi_out,int cu_nzlo_out,int cu_nzhi_out,int cu_nx_pppm,int cu_ny_pppm,int cu_nz_pppm - ,int cu_nxlo_fft,int cu_nxhi_fft,int cu_nylo_fft,int cu_nyhi_fft,int cu_nzlo_fft,int cu_nzhi_fft,void* cu_gf_b - ,double cu_qqrd2e, int cu_order, void* cu_rho_coeff,void* cu_debugdata,void* cu_density_brick_int,int cu_slabflag - ) -{ - CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); - cudaMemcpyToSymbol("density_brick",&cu_density_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_brick_int",&cu_density_brick_int, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdx_brick",&cu_vdx_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdy_brick",&cu_vdy_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vdz_brick",&cu_vdz_brick, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("density_fft",&cu_density_fft, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("energy",&cu_energy, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("virial",&cu_virial, sizeof(ENERGY_FLOAT*)); - cudaMemcpyToSymbol("nxlo_in",&cu_nxlo_in, sizeof(int)); - cudaMemcpyToSymbol("nxhi_in",&cu_nxhi_in, sizeof(int)); - cudaMemcpyToSymbol("nxlo_out",&cu_nxlo_out, sizeof(int)); - cudaMemcpyToSymbol("nxhi_out",&cu_nxhi_out, sizeof(int)); - cudaMemcpyToSymbol("nylo_in",&cu_nylo_in, sizeof(int)); - cudaMemcpyToSymbol("nyhi_in",&cu_nyhi_in, sizeof(int)); - cudaMemcpyToSymbol("nylo_out",&cu_nylo_out, sizeof(int)); - cudaMemcpyToSymbol("nyhi_out",&cu_nyhi_out, sizeof(int)); - cudaMemcpyToSymbol("nzlo_in",&cu_nzlo_in, sizeof(int)); - cudaMemcpyToSymbol("nzhi_in",&cu_nzhi_in, sizeof(int)); - cudaMemcpyToSymbol("nzlo_out",&cu_nzlo_out, sizeof(int)); - cudaMemcpyToSymbol("nzhi_out",&cu_nzhi_out, sizeof(int)); - cudaMemcpyToSymbol("nxlo_fft",&cu_nxlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nxhi_fft",&cu_nxhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nylo_fft",&cu_nylo_fft, sizeof(int)); - cudaMemcpyToSymbol("nyhi_fft",&cu_nyhi_fft, sizeof(int)); - cudaMemcpyToSymbol("nzlo_fft",&cu_nzlo_fft, sizeof(int)); - cudaMemcpyToSymbol("nzhi_fft",&cu_nzhi_fft, sizeof(int)); - cudaMemcpyToSymbol("slabflag",&cu_slabflag, sizeof(int)); - cudaMemcpyToSymbol("nx_pppm",&cu_nx_pppm, sizeof(int)); - cudaMemcpyToSymbol("ny_pppm",&cu_ny_pppm, sizeof(int)); - cudaMemcpyToSymbol("nz_pppm",&cu_nz_pppm, sizeof(int)); - cudaMemcpyToSymbol("work1",&cu_work1, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work2",&cu_work2, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("work3",&cu_work3, sizeof(FFT_FLOAT*)); - cudaMemcpyToSymbol("greensfn",&cu_greensfn, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("gf_b",&cu_gf_b, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkx",&cu_fkx, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fky",&cu_fky, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("fkz",&cu_fkz, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("vg",&cu_vg, sizeof(PPPM_FLOAT*)); - - PPPM_FLOAT cu_qqrd2e_a=cu_qqrd2e; - cudaMemcpyToSymbol("qqrd2e",&cu_qqrd2e_a, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("order",&cu_order, sizeof(int)); - cudaMemcpyToSymbol("rho_coeff",&cu_rho_coeff, sizeof(PPPM_FLOAT*)); - cudaMemcpyToSymbol("debugdata",&cu_debugdata, sizeof(PPPM_FLOAT*)); - - CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); - -/*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); - -#ifdef PPPM_PRECISION -if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); -if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); -if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); -#endif -#ifdef ENERGY_PRECISION -if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); -if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); -#endif -#ifdef X_PRECISION -if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); -if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); -#endif -#ifdef F_PRECISION -if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); -if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); -#endif*/ -} - -void pppm_device_init_setup(cuda_shared_data* sdata,PPPM_FLOAT cu_shiftone,PPPM_FLOAT cu_delxinv,PPPM_FLOAT cu_delyinv,PPPM_FLOAT cu_delzinv,int cu_nlower,int cu_nupper) -{ - cudaMemcpyToSymbol("delxinv",&cu_delxinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delyinv",&cu_delyinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("delzinv",&cu_delzinv, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("shiftone",&cu_shiftone, sizeof(PPPM_FLOAT)); - cudaMemcpyToSymbol("nlower",&cu_nlower, sizeof(int)); - cudaMemcpyToSymbol("nupper",&cu_nupper, sizeof(int)); - cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo, 3*sizeof(X_FLOAT)); - CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); -} - -void pppm_device_update(cuda_shared_data* sdata,void* cu_part2grid, int nlocala,int nmaxa) -{ - cudaMemcpyToSymbol("part2grid",&cu_part2grid, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); - cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); - //cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - cudaMemcpyToSymbol("nmax" , &nmaxa, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); - -} - -void pppm_update_nlocal(int nlocala) -{ - cudaMemcpyToSymbol("nlocal" , &nlocala, sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); -} - - -void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_fkxyz_vg<<>>(unitkx,unitky,unitkz,g_ewald); - cudaThreadSynchronize(); - - CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); -} - -void Cuda_PPPM_setup_greensfn(int nx_pppma,int ny_pppma,int nz_pppma,PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald, -int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - setup_greensfn<<>>(unitkx,unitky,unitkz,g_ewald,nbx,nby,nbz,xprd,yprd, zprd_slab); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); -} - -void poisson_scale(int nx_pppma,int ny_pppma,int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_scale_kernel<<>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); - -} - -void poisson_xgrad(int nx_pppma,int ny_pppma,int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_xgrad_kernel<<>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); -} - -void poisson_ygrad(int nx_pppma,int ny_pppma,int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_ygrad_kernel<<>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); -} - -void poisson_zgrad(int nx_pppma,int ny_pppma,int nz_pppma) -{ - dim3 grid; - dim3 threads; - grid.x=nz_pppma; - grid.y=ny_pppma; - grid.z=1; - threads.x=nx_pppma; - threads.y=1; - threads.z=1; - poisson_zgrad_kernel<<>>(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); -} - -void poisson_vdx_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppma,int ny_pppma,int nz_pppma) -{ - - dim3 grid; - dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); - poisson_vdx_brick_kernel<<>>(ilo,jlo,klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); - cudaThreadSynchronize(); -} - -void poisson_vdy_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) -{ - dim3 grid; - dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdy_brick_kernel<<>>(ilo,jlo,klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); - cudaThreadSynchronize(); -} - -void poisson_vdz_brick(int ihi,int ilo,int jhi,int jlo,int khi,int klo,int nx_pppm,int ny_pppm,int nz_pppm) -{ - dim3 grid; - dim3 threads; - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - grid.z=1; - threads.x=ihi-ilo+1; - threads.y=1; - threads.z=1; - poisson_vdz_brick_kernel<<>>(ilo,jlo,klo); - CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); - cudaThreadSynchronize(); -} - - -void poisson_energy(int nxlo_fft,int nxhi_fft,int nylo_fft,int nyhi_fft,int nzlo_fft,int nzhi_fft,int vflag) -{ - //printf("VFLAG_GPU: %i\n",vflag); - CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); - dim3 grid; - dim3 threads; - grid.x=nzhi_fft-nzlo_fft+1; - grid.y=nyhi_fft-nylo_fft+1; - grid.z=1; - threads.x=nxhi_fft-nxlo_fft+1; - threads.y=1; - threads.z=1; - poisson_energy_kernel<<>>(nxlo_fft,nylo_fft,nzlo_fft,vflag); - - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); -} - -ENERGY_FLOAT sum_energy(void* cu_virial,void* cu_energy,int nx_pppma,int ny_pppma,int nz_pppma,int vflag,ENERGY_FLOAT* cpu_virial) -{ - ENERGY_FLOAT host_energy=0; - dim3 grid; - dim3 threads; - - grid.x=nz_pppma; - grid.y=1; - grid.z=1; - threads.x=ny_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel1<<>>(vflag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 "); - - grid.x=1; - grid.y=1; - grid.z=1; - threads.x=nz_pppma; - threads.y=1; - threads.z=1; - sum_energy_kernel2<<>>(vflag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 "); - - cudaMemcpy((void*) (&host_energy), cu_energy, sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); - if(vflag) - cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy"); - - return host_energy; -} - -void cuda_make_rho(cuda_shared_data* sdata,void* flag,PPPM_FLOAT* cu_density_intScale,int ihi,int ilo,int jhi,int jlo,int khi,int klo,void* cu_density_brick,void* cu_density_brick_int) -{ - CUT_CHECK_ERROR("cuda_make_rho begin"); - dim3 grid,threads; - int cpu_flag[3]; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - do - { - cpu_flag[0]=0; - cpu_flag[1]=0; - cpu_flag[2]=0; - cudaMemcpyToSymbol("density_intScale",cu_density_intScale,sizeof(PPPM_FLOAT*)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z"); - cudaMemset(flag,0,3*sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A"); - cudaMemset(cu_density_brick,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(PPPM_FLOAT)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B"); - cudaMemset(cu_density_brick_int,0,(khi-klo+1)*(jhi-jlo+1)*(ihi-ilo+1)*sizeof(int)); - CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C"); - make_rho_kernel<<>>((int*) flag,32/(sdata->pppm.nupper-sdata->pppm.nlower+1)); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA make_rho A"); - cudaMemcpy((void*) &cpu_flag, flag, 3*sizeof(int),cudaMemcpyDeviceToHost); - if(cpu_flag[0]!=0) {(*cu_density_intScale)/=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n",*cu_density_intScale);)} - if((cpu_flag[0]==0)&&(cpu_flag[1]==0)) {(*cu_density_intScale)*=2; MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n",*cu_density_intScale);)} - /* if((*cu_density_intScale)>0xe0000000) - { - printf("Error Scaling\n"); - cpu_flag[0]=0; - cpu_flag[1]=1; - }*/ - CUT_CHECK_ERROR("ERROR-CUDA make_rho B"); - } while((cpu_flag[0]!=0)||(cpu_flag[1]==0)); - - - grid.x=khi-klo+1; - grid.y=jhi-jlo+1; - threads.x=ihi-ilo+1; - scale_rho_kernel<<>>(); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale"); -} - - -int cuda_particle_map(cuda_shared_data* sdata,void* flag) -{ - dim3 grid,threads; - int cpu_flag; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre"); - particle_map_kernel<<>>((int*) flag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA particla_map a"); - cudaMemcpy((void*) &cpu_flag, flag, sizeof(int),cudaMemcpyDeviceToHost); - CUT_CHECK_ERROR("ERROR-CUDA particla_map b"); - return cpu_flag; -} - - -void cuda_fieldforce(cuda_shared_data* sdata,void* flag) -{ - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - int sharedmemsize=(32+3*32*(sdata->pppm.nupper-sdata->pppm.nlower+1)+sdata->pppm.order*(sdata->pppm.order/2-(1-sdata->pppm.order)/2+1))*sizeof(PPPM_FLOAT); - fieldforce_kernel<<>> - (sdata->pppm.nupper-sdata->pppm.nlower+1,32/(sdata->pppm.nupper-sdata->pppm.nlower+1),(int*) flag); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA fieldforce"); -} - -double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf) -{ - dim3 grid,threads; - grid.x=(sdata->atom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_energy_kernel<<>>(dev_buf); - cudaThreadSynchronize(); - cudaMemcpy((void*) buf, dev_buf, grid.x*sizeof(ENERGY_FLOAT),cudaMemcpyDeviceToHost); - - double dipole_all=0.0; - for(int i=0;iatom.nlocal+31)/32; - grid.y=1; - grid.z=1; - threads.x=32; - threads.y=1; - threads.z=1; - slabcorr_force_kernel<<>>(ffact); - cudaThreadSynchronize(); -} - -void sum_virial(double* host_virial) -{ -} - -void pppm_initfftdata(cuda_shared_data* sdata,PPPM_FLOAT* in,FFT_FLOAT* out) -{ - int nslow=sdata->pppm.nzhi_in-sdata->pppm.nzlo_in; - int nmid=sdata->pppm.nyhi_in-sdata->pppm.nylo_in; - int nfast=sdata->pppm.nxhi_in-sdata->pppm.nxlo_in; - int nrimz=MAX(sdata->pppm.nzlo_in-sdata->pppm.nzlo_out,sdata->pppm.nzhi_out-sdata->pppm.nzhi_in); - int nrimy=MAX(sdata->pppm.nylo_in-sdata->pppm.nylo_out,sdata->pppm.nyhi_out-sdata->pppm.nyhi_in); - int nrimx=MAX(sdata->pppm.nxlo_in-sdata->pppm.nxlo_out,sdata->pppm.nxhi_out-sdata->pppm.nxhi_in); - dim3 grid; - grid.x=nslow+1; - grid.y=nmid+1; - grid.z=1; - dim3 threads; - threads.x=nfast+1; - threads.y=1; - threads.z=1; - cudaThreadSynchronize(); - initfftdata_core_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nfast+1; - initfftdata_z_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_y_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_x_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nfast+1; - initfftdata_yz_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nmid+1; - threads.x=nrimx; - initfftdata_xz_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nslow+1; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xy_kernel<<>>(in,out); - cudaThreadSynchronize(); - grid.x=nrimz; - grid.y=nrimy; - threads.x=nrimx; - initfftdata_xyz_kernel<<>>(in,out); - cudaThreadSynchronize(); - CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel"); -} - -