From 860bca7d5e5d4d545fc43a3ef776e0bd604b35e9 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 24 Oct 2011 17:49:51 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7181 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/cuda/atom_vec_cuda.cu | 45 +++++++++------ lib/cuda/comm_cuda.cu | 14 +++-- lib/cuda/compute_temp_cuda.cu | 6 +- lib/cuda/compute_temp_cuda_kernel.cu | 14 ++--- lib/cuda/compute_temp_partial_cuda.cu | 4 +- lib/cuda/compute_temp_partial_cuda_kernel.cu | 14 ++--- lib/cuda/cuda_pair.cu | 58 ++++++++++++-------- lib/cuda/domain_kernel.cu | 2 +- lib/cuda/pair_eam_cuda.cu | 24 +++++--- 9 files changed, 108 insertions(+), 73 deletions(-) diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu index 187718dc36..3bee50d6ef 100644 --- a/lib/cuda/atom_vec_cuda.cu +++ b/lib/cuda/atom_vec_cuda.cu @@ -78,12 +78,12 @@ void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) ); cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) ); - if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); - if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) ); - if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) ); - if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) ); - if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) ); - if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) ); + if(data_mask & Q_MASK) cudaMemcpyToSymbolAsync(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) ); + if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbolAsync(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) ); + if(data_mask & RADIUS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) ); + if(data_mask & DENSITY_MASK) cudaMemcpyToSymbolAsync(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) ); + if(data_mask & RMASS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) ); + if(data_mask & OMEGA_MASK) cudaMemcpyToSymbolAsync(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) ); //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) ); cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); } @@ -92,12 +92,16 @@ template void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata) { MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); ) + if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); ) - cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); - cudaMemcpyToSymbol(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) ); - cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + cudaMemcpyToSymbolAsync(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT)); + cudaMemcpyToSymbolAsync(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbolAsync(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) ); + cudaMemcpyToSymbolAsync(MY_CONST(flag) , & sdata->flag, sizeof(int*) ); + cudaThreadSynchronize(); MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n"); ) } @@ -110,7 +114,7 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_ if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); if(sdata->atom.update_nlocal) - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); int n_data_items=AtomVecCuda_CountDataItems(data_mask); int size=(n*n_data_items)*sizeof(X_FLOAT); if(sdata->buffer_new or (size>sdata->buffersize)) @@ -265,6 +269,7 @@ template int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send) { MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n",dim); ) + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed"); cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); Cuda_AtomVecCuda_Init(sdata); int size=n*sizeof(double); @@ -280,7 +285,7 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void timespec time1,time2; clock_gettime(CLOCK_REALTIME,&time1); - Cuda_AtomVecCuda_PackExchangeList_Kernel<<>>(n-1,dim); + Cuda_AtomVecCuda_PackExchangeList_Kernel<<>>(n-1,dim); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); @@ -290,7 +295,9 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost); int return_value = ((int*) buf_send)[0]; - cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost); + if(n>1+return_value) + cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed"); clock_gettime(CLOCK_REALTIME,&time1); sdata->cuda_timings.comm_exchange_download+= @@ -304,9 +311,11 @@ template int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist) { MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n"); ) + if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - + int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1; int size=(nsend*n_data_items+1)*sizeof(double); if(sdata->buffer_new or (size>sdata->buffersize)) @@ -323,7 +332,7 @@ int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_se Cuda_AtomVecCuda_PackExchange_Kernel<<>>(nsend,(int*) copylist); cudaThreadSynchronize(); - CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed"); clock_gettime(CLOCK_REALTIME,&time2); sdata->cuda_timings.comm_exchange_kernel_pack+= @@ -335,7 +344,7 @@ int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_se sdata->cuda_timings.comm_exchange_download+= time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000; - MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); ) + MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n"); ) return nsend*n_data_items+1; } @@ -393,6 +402,7 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); clock_gettime(CLOCK_REALTIME,&atime2); sdata->cuda_timings.test1+= @@ -451,6 +461,7 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); int n_data_items=AtomVecCuda_CountDataItems(data_mask); @@ -503,7 +514,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); - //if(sdata->atom.update_nlocal) + if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); clock_gettime(CLOCK_REALTIME,&atime2); sdata->cuda_timings.test1+= diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu index 0233f3ee13..dc7c01005d 100644 --- a/lib/cuda/comm_cuda.cu +++ b/lib/cuda/comm_cuda.cu @@ -50,12 +50,12 @@ void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n) void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata) { - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); + cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbolAsync(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); + cudaMemcpyToSymbolAsync(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbolAsync(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) ); + cudaMemcpyToSymbolAsync(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); + cudaMemcpyToSymbolAsync(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) ); } @@ -444,7 +444,9 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int inee { MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) timespec time1,time2; + if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); + if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); if(sdata->buffer_new or (80>sdata->buffersize)) Cuda_CommCuda_UpdateBuffer(sdata,10); diff --git a/lib/cuda/compute_temp_cuda.cu b/lib/cuda/compute_temp_cuda.cu index bb3fa5ce2a..4ade926461 100644 --- a/lib/cuda/compute_temp_cuda.cu +++ b/lib/cuda/compute_temp_cuda.cu @@ -83,8 +83,9 @@ void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FL cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed"); - int oldgrid=grid.x; + int oldgrid=grid.x*grid.y; grid.x=6; + grid.y=1; threads.x=512; Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); cudaThreadSynchronize(); @@ -111,8 +112,9 @@ void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FL cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed"); - int oldgrid=grid.x; + int oldgrid=grid.x*grid.y; grid.x=1; + grid.y=1; threads.x=512; Cuda_ComputeTempCuda_Reduce_Kernel<<>> (oldgrid,t); cudaThreadSynchronize(); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu index 3e97148f6b..c5de884cd1 100644 --- a/lib/cuda/compute_temp_cuda_kernel.cu +++ b/lib/cuda/compute_temp_cuda_kernel.cu @@ -42,7 +42,7 @@ __global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; if(threadIdx.x==0) { - buffer[blockIdx.x]=sharedmem[0]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0]; } } @@ -76,12 +76,12 @@ __global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; if(threadIdx.x==0) { - buffer[blockIdx.x]=sharedmem[0]; - buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; - buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; - buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; - buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; - buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x]; + buffer[(blockIdx.x*gridDim.y+blockIdx.y)+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x]; } } diff --git a/lib/cuda/compute_temp_partial_cuda.cu b/lib/cuda/compute_temp_partial_cuda.cu index 07e19936f1..94a4fa9ea3 100644 --- a/lib/cuda/compute_temp_partial_cuda.cu +++ b/lib/cuda/compute_temp_partial_cuda.cu @@ -83,7 +83,7 @@ void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,EN cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed"); - int oldgrid=grid.x; + int oldgrid=grid.x*grid.y; grid.x=6; threads.x=512; Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); @@ -111,7 +111,7 @@ void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,EN cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed"); - int oldgrid=grid.x; + int oldgrid=grid.x*grid.y; grid.x=1; threads.x=512; Cuda_ComputeTempPartialCuda_Reduce_Kernel<<>> (oldgrid,t); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu index c14c3a06a2..7c7895ca43 100644 --- a/lib/cuda/compute_temp_partial_cuda_kernel.cu +++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu @@ -42,7 +42,7 @@ __global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; if(threadIdx.x==0) { - buffer[blockIdx.x]=sharedmem[0]; + buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; } } @@ -76,12 +76,12 @@ __global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer; if(threadIdx.x==0) { - buffer[blockIdx.x]=sharedmem[0]; - buffer[blockIdx.x+gridDim.x]=sharedmem[blockDim.x]; - buffer[blockIdx.x+2*gridDim.x]=sharedmem[2*blockDim.x]; - buffer[blockIdx.x+3*gridDim.x]=sharedmem[3*blockDim.x]; - buffer[blockIdx.x+4*gridDim.x]=sharedmem[4*blockDim.x]; - buffer[blockIdx.x+5*gridDim.x]=sharedmem[5*blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x]; + buffer[blockIdx.x*gridDim.y+blockIdx.y+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x]; } } diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu index b7b2523529..1d7a439220 100644 --- a/lib/cuda/cuda_pair.cu +++ b/lib/cuda/cuda_pair.cu @@ -208,29 +208,33 @@ void Cuda_UpdateBuffer(cuda_shared_data* sdata,int size) CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed"); } +void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + //Neighbor + cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); + cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); + +if(sdata->overlap_comm) +{ + cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); +} + +} //Update constants after nmax change which are generally needed by all pair styles void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin"); - //Neighbor - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) ); - - if(sdata->overlap_comm) - { - cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); - cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) ); - } //System cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); @@ -757,6 +761,8 @@ timespec startpairtime, endpairtime; //Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,int eflag, int vflag, dim3& grid, dim3& threads, int& sharedperproc,bool need_q=false,int maxthreads=256) { + if(sdata->atom.update_neigh) + Cuda_Pair_UpdateNeighbor_AllStyles(sdata,sneighlist); if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax_AllStyles(sdata,sneighlist); if(sdata->atom.update_nlocal) @@ -880,7 +886,7 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata) cudaMemcpyToSymbol(MY_CONST(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*) ); cudaMemcpyToSymbol(MY_CONST(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*) ); cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) ); - cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*) ); + cudaMemcpyToSymbol(MY_CONST(omega_rmass),& sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*) ); CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed"); } @@ -888,9 +894,13 @@ void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata) void Cuda_Pair_GenerateXType(cuda_shared_data* sdata) { MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); ) + if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + if(sdata->atom.update_nlocal) + { + cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); + cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); + } MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout); ) int3 layout=getgrid(sdata->atom.nall); @@ -907,6 +917,7 @@ void Cuda_Pair_GenerateXType(cuda_shared_data* sdata) void Cuda_Pair_RevertXType(cuda_shared_data* sdata) { MYDBG(printf(" # CUDA: RevertXType ... start\n"); ) + if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata); cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); @@ -924,6 +935,7 @@ void Cuda_Pair_RevertXType(cuda_shared_data* sdata) void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata) { MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); ) + if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata); cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); @@ -943,6 +955,7 @@ void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata) void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata) { MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n",sdata->atom.nlocal,sdata->atom.nall,sdata->atom.nmax,sdata->atom.x.dev_data,sdata->atom.x_type.dev_data,sdata->atom.xhold.dev_data,sdata->atom.type.dev_data); ) + if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata); cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); @@ -961,6 +974,7 @@ void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata) void Cuda_Pair_BuildXHold(cuda_shared_data* sdata) { + if(sdata->atom.update_nmax) Cuda_Pair_UpdateNmax(sdata); cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) ); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu index ec5ef897c1..fa76974076 100644 --- a/lib/cuda/domain_kernel.cu +++ b/lib/cuda/domain_kernel.cu @@ -205,7 +205,7 @@ __global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_c maxz=sharedmem[0]; __syncthreads(); } - else {minx=lo[2];maxx=hi[2];} + else {minz=lo[2];maxz=hi[2];} if(threadIdx.x==0) { buf=(X_FLOAT*) _buffer; diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu index 29ad4af271..d97143a0c7 100644 --- a/lib/cuda/pair_eam_cuda.cu +++ b/lib/cuda/pair_eam_cuda.cu @@ -134,18 +134,22 @@ void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlis CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed"); } +void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ +cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); +cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); +cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); +cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); +cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); +cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); +cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); +cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); +cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); +} + void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) { CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); - cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned) ); - cudaMemcpyToSymbol(MY_CONST(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) ); - cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*) ); - cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) ); cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) ); cudaMemcpyToSymbol(MY_CONST(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*) ); cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) ); @@ -228,6 +232,8 @@ void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlis if(sdata->atom.update_nmax) Cuda_PairEAMCuda_UpdateNmax(sdata,sneighlist); + if(sdata->atom.update_neigh) + Cuda_PairEAMCuda_UpdateNeighbor(sdata,sneighlist); if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) ); if(sdata->buffer_new)