From 41bf44183841217c3442a7db0559875c46576a23 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 2 Aug 2013 15:03:02 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10520 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/USER-CUDA/atom_vec_angle_cuda.cpp | 12 ++--- src/USER-CUDA/atom_vec_full_cuda.cpp | 12 ++--- src/USER-CUDA/comm_cuda.cpp | 56 +++++++++---------- src/USER-CUDA/cuda.cpp | 32 +++++------ src/USER-CUDA/cuda_data.h | 72 ++++++++++++------------- src/USER-CUDA/fft3d_cuda.cpp | 6 +-- src/USER-CUDA/fix_shake_cuda.cpp | 8 +-- src/USER-CUDA/pppm_cuda.cpp | 52 +++++++++--------- src/USER-CUDA/verlet_cuda.cpp | 78 +++++++++++++-------------- 9 files changed, 164 insertions(+), 164 deletions(-) diff --git a/src/USER-CUDA/atom_vec_angle_cuda.cpp b/src/USER-CUDA/atom_vec_angle_cuda.cpp index 4179094730..82a5b77dd2 100644 --- a/src/USER-CUDA/atom_vec_angle_cuda.cpp +++ b/src/USER-CUDA/atom_vec_angle_cuda.cpp @@ -330,8 +330,8 @@ int AtomVecAngleCuda::pack_exchange(int dim, double *buf) int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_times time1,time2; + my_gettime(CLOCK_REALTIME,&time1); double* buf_p=*buf_pointer; for(int j=0;jshared_data.cuda_timings.comm_exchange_cpu_pack+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; @@ -423,8 +423,8 @@ int AtomVecAngleCuda::unpack_exchange(double *buf) int m = nsend_atoms*NCUDAEXCHANGE + 1; nlocal+=naccept; - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_times time1,time2; + my_gettime(CLOCK_REALTIME,&time1); for(int j=0;j (buf[j+1]); } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; diff --git a/src/USER-CUDA/atom_vec_full_cuda.cpp b/src/USER-CUDA/atom_vec_full_cuda.cpp index 4b859290c5..c430298cd6 100644 --- a/src/USER-CUDA/atom_vec_full_cuda.cpp +++ b/src/USER-CUDA/atom_vec_full_cuda.cpp @@ -331,8 +331,8 @@ int AtomVecFullCuda::pack_exchange(int dim, double *buf) int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data()); - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_times time1,time2; + my_gettime(CLOCK_REALTIME,&time1); double* buf_p=*buf_pointer; for(int j=0;jshared_data.cuda_timings.comm_exchange_cpu_pack+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; @@ -446,8 +446,8 @@ int AtomVecFullCuda::unpack_exchange(double *buf) int m = nsend_atoms*NCUDAEXCHANGE + 1; nlocal+=naccept; - timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_times time1,time2; + my_gettime(CLOCK_REALTIME,&time1); for(int j=0;j (buf[j+1]); } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp index 0c2e8e8ff0..aea8f73424 100644 --- a/src/USER-CUDA/comm_cuda.cpp +++ b/src/USER-CUDA/comm_cuda.cpp @@ -175,7 +175,7 @@ void CommCuda::forward_comm_cuda() static int count=0; static double kerneltime=0.0; static double copytime=0.0; - timespec time1,time2,time3; + my_times time1,time2,time3; int n; MPI_Request request; @@ -214,13 +214,13 @@ void CommCuda::forward_comm_cuda() size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double); else size_forward_recv_now=size_forward_recv[iswap]; -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE, recvproc[iswap],0,world,&request); n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]); -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used n=(n+1)*sizeof(X_FLOAT)/sizeof(double); @@ -229,7 +229,7 @@ clock_gettime(CLOCK_REALTIME,&time2); MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); MPI_Wait(&request,&status); -clock_gettime(CLOCK_REALTIME,&time3); +my_gettime(CLOCK_REALTIME,&time3); cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= @@ -307,7 +307,7 @@ void CommCuda::forward_comm_pack_cuda() static int count=0; static double kerneltime=0.0; static double copytime=0.0; - timespec time1,time2,time3; + my_times time1,time2,time3; int n; // initialize comm buffers & exchange memory MPI_Request request; @@ -335,12 +335,12 @@ void CommCuda::forward_comm_pack_cuda() { -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); // n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]); n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]); -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used n=(n+1)*sizeof(X_FLOAT)/sizeof(double); @@ -348,11 +348,11 @@ clock_gettime(CLOCK_REALTIME,&time2); } else if (ghost_velocity) { -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); // n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void*) &buf_send[iswap*maxsend],pbc[iswap],pbc_flag[iswap]); -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used n=(n+1)*sizeof(X_FLOAT)/sizeof(double); @@ -410,7 +410,7 @@ void CommCuda::forward_comm_transfer_cuda() static int count=0; static double kerneltime=0.0; static double copytime=0.0; - timespec time1,time2,time3; + my_times time1,time2,time3; int n; MPI_Request request; MPI_Status status; @@ -453,27 +453,27 @@ void CommCuda::forward_comm_transfer_cuda() //printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); CudaWrapper_DownloadCudaDataAsync((void*) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double),2); //MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); CudaWrapper_SyncStream(2); //printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_forward_download+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); MPI_Wait(&request,&status); //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2); -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); CudaWrapper_SyncStream(2); //printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4); //memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double)); //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now*sizeof(double),cuda->shared_data.comm.send_size[iswap]*sizeof(double)); -clock_gettime(CLOCK_REALTIME,&time3); +my_gettime(CLOCK_REALTIME,&time3); cuda->shared_data.cuda_timings.comm_forward_upload+= time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000; -clock_gettime(CLOCK_REALTIME,&time3); +my_gettime(CLOCK_REALTIME,&time3); cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; } @@ -486,17 +486,17 @@ cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= else size_forward_recv_now=size_forward_recv[iswap]; -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE, recvproc[iswap],0,world,&request); -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world); MPI_Wait(&request,&status); -clock_gettime(CLOCK_REALTIME,&time3); +my_gettime(CLOCK_REALTIME,&time3); cuda->shared_data.cuda_timings.comm_forward_mpi_upper+= time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000; cuda->shared_data.cuda_timings.comm_forward_mpi_lower+= @@ -548,7 +548,7 @@ void CommCuda::forward_comm_unpack_cuda() static int count=0; static double kerneltime=0.0; static double copytime=0.0; - timespec time1,time2,time3; + my_times time1,time2,time3; int n; MPI_Request request; MPI_Status status; @@ -762,7 +762,7 @@ void CommCuda::exchange_cuda() MPI_Request request; MPI_Status status; AtomVec *avec = atom->avec; - timespec time1,time2,time3; + my_times time1,time2,time3; // clear global->local map for owned and ghost atoms // b/c atoms migrate to new procs in exchange() and @@ -805,7 +805,7 @@ void CommCuda::exchange_cuda() // if 2 procs in dimension, single send/recv // if more than 2 procs in dimension, send/recv to both neighbors - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); if (procgrid[dim] == 1) { nrecv = nsend; @@ -841,7 +841,7 @@ void CommCuda::exchange_cuda() //printf("nsend: %i nrecv: %i\n",nsend,nrecv); // check incoming atoms to see if they are in my box // if so, add to my list -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_exchange_mpi+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; @@ -902,7 +902,7 @@ void CommCuda::borders_cuda() MPI_Request request; MPI_Status status; AtomVec *avec = atom->avec; - timespec time1,time2,time3; + my_times time1,time2,time3; // clear old ghosts @@ -966,7 +966,7 @@ void CommCuda::borders_cuda() // put incoming ghosts at end of my atom arrays // if swapping with self, simply copy, no messages -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); if (sendproc[iswap] != me) { MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); @@ -982,7 +982,7 @@ clock_gettime(CLOCK_REALTIME,&time1); buf = buf_send; } -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_border_mpi+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; @@ -1037,7 +1037,7 @@ void CommCuda::borders_cuda_overlap_forward_comm() MPI_Request request; MPI_Status status; AtomVec *avec = atom->avec; - timespec time1,time2,time3; + my_times time1,time2,time3; // clear old ghosts @@ -1102,7 +1102,7 @@ void CommCuda::borders_cuda_overlap_forward_comm() // put incoming ghosts at end of my atom arrays // if swapping with self, simply copy, no messages -clock_gettime(CLOCK_REALTIME,&time1); +my_gettime(CLOCK_REALTIME,&time1); if (sendproc[iswap] != me) { MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); @@ -1118,7 +1118,7 @@ clock_gettime(CLOCK_REALTIME,&time1); buf = buf_send; } -clock_gettime(CLOCK_REALTIME,&time2); +my_gettime(CLOCK_REALTIME,&time2); cuda->shared_data.cuda_timings.comm_border_mpi+= time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000; diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp index b4515716ad..1160c8cbf2 100644 --- a/src/USER-CUDA/cuda.cpp +++ b/src/USER-CUDA/cuda.cpp @@ -629,12 +629,12 @@ void Cuda::evsetup_eatom_vatom(int eflag_atom, int vflag_atom) void Cuda::uploadAll() { MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");) - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; if(atom->nmax != shared_data.atom.nmax) checkResize(); - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); cu_x ->upload(); cu_v ->upload(); cu_f ->upload(); @@ -663,7 +663,7 @@ void Cuda::uploadAll() if(cu_vatom) cu_vatom->upload(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);) CUDA_IF_BINNING(Cuda_Binning(& shared_data);) @@ -675,13 +675,13 @@ void Cuda::uploadAll() void Cuda::downloadAll() { MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");) - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; if(atom->nmax != shared_data.atom.nmax) checkResize(); CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);) - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); cu_x ->download(); cu_v ->download(); cu_f ->download(); @@ -713,7 +713,7 @@ void Cuda::downloadAll() if(cu_vatom) cu_vatom->download(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");) } @@ -721,12 +721,12 @@ void Cuda::downloadAll() void Cuda::upload(int datamask) { MYDBG(printf("# CUDA: Cuda::upload() ... start\n");) - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; if(atom->nmax != shared_data.atom.nmax) checkResize(); - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); if(X_MASK & datamask) cu_x ->upload(); if(V_MASK & datamask) cu_v ->upload(); if(F_MASK & datamask) cu_f ->upload(); @@ -766,7 +766,7 @@ void Cuda::upload(int datamask) if(cu_vatom) cu_vatom->upload(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); MYDBG(printf("# CUDA: Cuda::upload() ... end\n");) } @@ -774,13 +774,13 @@ void Cuda::upload(int datamask) void Cuda::download(int datamask) { MYDBG(printf("# CUDA: Cuda::download() ... start\n");) - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; if(atom->nmax != shared_data.atom.nmax) checkResize(); CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);) - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); if(X_MASK & datamask) cu_x ->download(); if(V_MASK & datamask) cu_v ->download(); if(F_MASK & datamask) cu_f ->download(); @@ -820,7 +820,7 @@ void Cuda::download(int datamask) if(cu_vatom) cu_vatom->download(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); MYDBG(printf("# CUDA: Cuda::download() ... end\n");) } diff --git a/src/USER-CUDA/cuda_data.h b/src/USER-CUDA/cuda_data.h index ed8a9ff7aa..988f3c2d4d 100644 --- a/src/USER-CUDA/cuda_data.h +++ b/src/USER-CUDA/cuda_data.h @@ -281,9 +281,9 @@ void cCudaData else { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -298,9 +298,9 @@ void cCudaData else { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -311,7 +311,7 @@ void cCudaData case xy: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) { dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; @@ -320,7 +320,7 @@ void cCudaData temp[j] = static_cast((reinterpret_cast(host_data))[i][j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -330,7 +330,7 @@ void cCudaData case yx: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned j=0; jdim[1]; ++j) { dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; @@ -339,7 +339,7 @@ void cCudaData temp[i] = static_cast(reinterpret_cast(host_data)[i][j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -348,7 +348,7 @@ void cCudaData case xyz: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned j=0; jdim[1]; ++j) { @@ -358,7 +358,7 @@ void cCudaData temp[k] = static_cast(reinterpret_cast(host_data)[i][j][k]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -368,7 +368,7 @@ void cCudaData case xzy: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned k=0; kdim[2]; ++k) { @@ -378,7 +378,7 @@ void cCudaData temp[j] = static_cast(reinterpret_cast(host_data)[i][j][k]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes); @@ -459,9 +459,9 @@ void cCudaData else { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -476,9 +476,9 @@ void cCudaData else { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) temp_data[i] = static_cast(host_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -489,7 +489,7 @@ void cCudaData case xy: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) { dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; @@ -498,7 +498,7 @@ void cCudaData temp[j] = static_cast((reinterpret_cast(host_data))[i][j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -508,7 +508,7 @@ void cCudaData case yx: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned j=0; jdim[1]; ++j) { dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; @@ -517,7 +517,7 @@ void cCudaData temp[i] = static_cast(reinterpret_cast(host_data)[i][j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -526,7 +526,7 @@ void cCudaData case xyz: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned j=0; jdim[1]; ++j) { @@ -536,7 +536,7 @@ void cCudaData temp[k] = static_cast(reinterpret_cast(host_data)[i][j][k]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -546,7 +546,7 @@ void cCudaData case xzy: { timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned k=0; kdim[2]; ++k) { @@ -556,7 +556,7 @@ void cCudaData temp[j] = static_cast(reinterpret_cast(host_data)[i][j][k]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufUploadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream); @@ -585,9 +585,9 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); } @@ -602,9 +602,9 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) host_data[i] = static_cast(temp_data[i]); - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); } @@ -615,7 +615,7 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) { dev_type* temp = &temp_data[i * dev_data_array->dim[1]]; @@ -624,7 +624,7 @@ void cCudaData reinterpret_cast(host_data)[i][j] = static_cast(temp[j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); break; @@ -634,7 +634,7 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned j=0; jdim[1]; ++j) { dev_type* temp = &temp_data[j*dev_data_array->dim[0]]; @@ -643,7 +643,7 @@ void cCudaData reinterpret_cast(host_data)[i][j] = static_cast(temp[i]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); break; @@ -653,7 +653,7 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned j=0; jdim[1]; ++j) { @@ -663,7 +663,7 @@ void cCudaData reinterpret_cast(host_data)[i][j][k] = static_cast(temp[k]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); break; @@ -673,7 +673,7 @@ void cCudaData { CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes); timespec time1,time2; - clock_gettime(CLOCK_REALTIME,&time1); + my_gettime(CLOCK_REALTIME,&time1); for(unsigned i=0; idim[0]; ++i) for(unsigned k=0; kdim[2]; ++k) { @@ -683,7 +683,7 @@ void cCudaData reinterpret_cast(host_data)[i][j][k] = static_cast(temp[j]); } } - clock_gettime(CLOCK_REALTIME,&time2); + my_gettime(CLOCK_REALTIME,&time2); CudaWrapper_AddCPUBufDownloadTime( time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000); break; diff --git a/src/USER-CUDA/fft3d_cuda.cpp b/src/USER-CUDA/fft3d_cuda.cpp index c09654d977..bd1116e447 100644 --- a/src/USER-CUDA/fft3d_cuda.cpp +++ b/src/USER-CUDA/fft3d_cuda.cpp @@ -88,8 +88,8 @@ void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan { #ifdef FFT_CUFFT plan->iterate++; - timespec starttime,starttime2; - timespec endtime,endtime2; + my_times starttime,starttime2; + my_times endtime,endtime2; int i,total,length,offset,num; double norm; @@ -103,7 +103,7 @@ void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan if(nprocs>1) { if(plan->init) - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); if (plan->pre_plan) { if (plan->pre_target == 0) copy = out; else copy = plan->copy; diff --git a/src/USER-CUDA/fix_shake_cuda.cpp b/src/USER-CUDA/fix_shake_cuda.cpp index 1a7826ae1a..8391626615 100644 --- a/src/USER-CUDA/fix_shake_cuda.cpp +++ b/src/USER-CUDA/fix_shake_cuda.cpp @@ -670,8 +670,8 @@ void FixShakeCuda::pre_neighbor() void FixShakeCuda::post_force(int vflag) { - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; if(cuda->finished_setup && neighbor_step) { @@ -715,7 +715,7 @@ void FixShakeCuda::post_force(int vflag) // loop over clusters - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); if(cuda->finished_setup) { cu_virial->upload(); @@ -739,7 +739,7 @@ void FixShakeCuda::post_force(int vflag) if((not cuda->finished_setup)) cuda->cu_f->upload(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); if(cuda->finished_setup) time_postforce += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp index 2052582c48..565f59b747 100644 --- a/src/USER-CUDA/pppm_cuda.cpp +++ b/src/USER-CUDA/pppm_cuda.cpp @@ -691,10 +691,10 @@ void PPPMCuda::compute(int eflag, int vflag) cuda_shared_atom* cu_atom = & cuda->shared_data.atom; int i; - timespec starttime; - timespec endtime; - timespec starttotal; - timespec endtotal; + my_times starttime; + my_times endtime; + my_times starttotal; + my_times endtotal; // convert atoms from box to lamda coords if (triclinic == 0) boxlo = domain->boxlo; @@ -726,23 +726,23 @@ void PPPMCuda::compute(int eflag, int vflag) cu_virial->memset_device(0); } if(eflag) cu_energy->memset_device(0); - clock_gettime(CLOCK_REALTIME,&starttotal); + my_gettime(CLOCK_REALTIME,&starttotal); // find grid points for all my particles // map my particle charge onto my local 3d density grid - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); particle_map(); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.pppm_particle_map+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); //cu_part2grid->download(); - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); make_rho(); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.pppm_make_rho+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); // all procs communicate density values from their ghost cells @@ -751,7 +751,7 @@ void PPPMCuda::compute(int eflag, int vflag) int nprocs=comm->nprocs; - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); if(nprocs>1) { @@ -765,16 +765,16 @@ void PPPMCuda::compute(int eflag, int vflag) #endif } - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.pppm_brick2fft+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); // compute potential gradient on my FFT grid and // portion of e_long on this proc's FFT grid // return gradients (electric fields) in 3d brick decomposition - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); poisson(eflag,vflag); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.pppm_poisson+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); // all procs communicate E-field values to fill ghost cells @@ -785,14 +785,14 @@ void PPPMCuda::compute(int eflag, int vflag) // calculate the force on my particles - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); fieldforce(); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); cuda->shared_data.cuda_timings.pppm_fieldforce+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); // sum energy across procs and add in volume-dependent term - clock_gettime(CLOCK_REALTIME,&endtotal); + my_gettime(CLOCK_REALTIME,&endtotal); cuda->shared_data.cuda_timings.pppm_compute+=(endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000); if (eflag) { @@ -1303,14 +1303,14 @@ void PPPMCuda::poisson(int eflag, int vflag) return; #endif #ifdef FFT_CUFFT - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); fft1c->compute(density_fft,work1,1); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); @@ -1341,9 +1341,9 @@ void PPPMCuda::poisson(int eflag, int vflag) poisson_xgrad(nx_pppm,ny_pppm,nz_pppm); - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); fft2c->compute(work2,work2,-1); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); poisson_vdx_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); @@ -1353,9 +1353,9 @@ void PPPMCuda::poisson(int eflag, int vflag) poisson_ygrad(nx_pppm,ny_pppm,nz_pppm); - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); fft2c->compute(work2,work2,-1); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); poisson_vdy_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); @@ -1364,9 +1364,9 @@ void PPPMCuda::poisson(int eflag, int vflag) poisson_zgrad(nx_pppm,ny_pppm,nz_pppm); - clock_gettime(CLOCK_REALTIME,&starttime); + my_gettime(CLOCK_REALTIME,&starttime); fft2c->compute(work2,work2,-1); - clock_gettime(CLOCK_REALTIME,&endtime); + my_gettime(CLOCK_REALTIME,&endtime); poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000); poisson_vdz_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm); diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp index 9cdba7c483..975b1349a4 100644 --- a/src/USER-CUDA/verlet_cuda.cpp +++ b/src/USER-CUDA/verlet_cuda.cpp @@ -173,8 +173,8 @@ void VerletCuda::setup() if(elist_atom || vlist_atom) cuda->checkResize(); int test_BpA_vs_TpA = true; - timespec starttime; - timespec endtime; + my_times starttime; + my_times endtime; #ifdef NO_PREC_TIMING double startsec, endsec; #endif @@ -201,7 +201,7 @@ void VerletCuda::setup() #ifdef NO_PREC_TIMING startsec = 1.0 * clock() / CLOCKS_PER_SEC; #endif - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); for(int i = 0; i < StyleLoops; i++) { Cuda_Pair_GenerateXType(&cuda->shared_data); @@ -216,7 +216,7 @@ void VerletCuda::setup() CudaWrapper_Sync(); } - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); double TpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; #ifdef NO_PREC_TIMING @@ -240,7 +240,7 @@ void VerletCuda::setup() force->pair->compute(eflag, vflag); CudaWrapper_Sync(); - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); #ifdef NO_PREC_TIMING startsec = 1.0 * clock() / CLOCKS_PER_SEC; #endif @@ -258,7 +258,7 @@ void VerletCuda::setup() CudaWrapper_Sync(); } - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); double BpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; #ifdef NO_PREC_TIMING endsec = 1.0 * clock() / CLOCKS_PER_SEC; @@ -586,16 +586,16 @@ void VerletCuda::run(int n) int testatom = cuda->testatom; //48267; - timespec starttime; - timespec endtime; - timespec starttotal; - timespec endtotal; + my_times starttime; + my_times endtime; + my_times starttotal; + my_times endtotal; cuda->setTimingsZero(); static double testtime = 0.0; - // clock_gettime(CLOCK_REALTIME,&starttime); - // clock_gettime(CLOCK_REALTIME,&endtime); + // my_gettime(CLOCK_REALTIME,&starttime); + // my_gettime(CLOCK_REALTIME,&endtime); // testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000; // printf("Time: %lf\n",testtime);*/ @@ -692,13 +692,13 @@ void VerletCuda::run(int n) //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms) //build communication buffers // printf("Pre forward_comm(1)\n"); - clock_gettime(CLOCK_REALTIME, &starttotal); + my_gettime(CLOCK_REALTIME, &starttotal); cuda->shared_data.atom.reneigh_flag = 0; - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); timer->stamp(); comm->forward_comm(1); timer->stamp(TIME_COMM); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.comm_forward_total += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; @@ -721,9 +721,9 @@ void VerletCuda::run(int n) //CudaWrapper_Sync(); //download comm buffers from GPU, perform MPI communication and upload buffers again - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); comm->forward_comm(2); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.comm_forward_total += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; timer->stamp(TIME_COMM); @@ -733,9 +733,9 @@ void VerletCuda::run(int n) timer->stamp(TIME_PAIR); //unpack communication buffers - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); comm->forward_comm(3); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.comm_forward_total += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; @@ -745,9 +745,9 @@ void VerletCuda::run(int n) endtotal.tv_sec - starttotal.tv_sec + 1.0 * (endtotal.tv_nsec - starttotal.tv_nsec) / 1000000000; } else { //perform standard forward communication - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); comm->forward_comm(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.comm_forward_total += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; timer->stamp(TIME_COMM); @@ -791,13 +791,13 @@ void VerletCuda::run(int n) MYDBG(printf("# CUDA VerletCuda::iterate: neighbor exchange\n");) //perform exchange of local atoms - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); comm->exchange(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); //special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future if(comm->nprocs > 1) { - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); if(atom->special) cuda->cu_special->upload(); @@ -805,7 +805,7 @@ void VerletCuda::run(int n) if(atom->nspecial) cuda->cu_nspecial->upload(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.test1 += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; } @@ -821,13 +821,13 @@ void VerletCuda::run(int n) MYDBG(printf("# CUDA VerletCuda::iterate: neighbor borders\n");) //generate ghost atom lists, and transfer ghost atom data - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); comm->borders(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.comm_border_total += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); //atom index maps are generated on CPU, and need to be transfered to GPU if they are used if(cuda->cu_map_array) cuda->cu_map_array->upload(); @@ -841,7 +841,7 @@ void VerletCuda::run(int n) MYDBG(printf("# CUDA VerletCuda::iterate: neighbor build\n");) timer->stamp(TIME_COMM); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.test2 += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; @@ -888,8 +888,8 @@ void VerletCuda::run(int n) //regenerate data layout for force computations, its actually only needed for the ghost atoms cuda->shared_data.comm.comm_phase = 2; - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); Cuda_Pair_GenerateXType(&cuda->shared_data); @@ -899,7 +899,7 @@ void VerletCuda::run(int n) if(cuda->cu_omega_rmass) Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); cuda->shared_data.cuda_timings.pair_xtype_conversion += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; force->pair->compute(eflag, vflag); @@ -909,8 +909,8 @@ void VerletCuda::run(int n) if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll(); else { //regenerate data layout for force computations, its actually only needed for the ghost atoms - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); Cuda_Pair_GenerateXType(&cuda->shared_data); @@ -920,7 +920,7 @@ void VerletCuda::run(int n) if(cuda->cu_omega_rmass) Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); cuda->shared_data.cuda_timings.pair_xtype_conversion += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; } @@ -967,7 +967,7 @@ void VerletCuda::run(int n) //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && (not(force->kspace && (not cuda->shared_data.pppm.cudable_force)))) { - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); cuda->cu_f->uploadAsync(2); test_atom(testatom, "post molecular force"); @@ -989,7 +989,7 @@ void VerletCuda::run(int n) timer->stamp(TIME_PAIR); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.pair_force_collection += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; } @@ -1020,7 +1020,7 @@ void VerletCuda::run(int n) if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && ((force->kspace && (not cuda->shared_data.pppm.cudable_force)))) { cuda->cu_f->uploadAsync(2); - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); if(eflag) cuda->cu_eng_vdwl->upload(); @@ -1038,7 +1038,7 @@ void VerletCuda::run(int n) timer->stamp(TIME_PAIR); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); cuda->shared_data.cuda_timings.pair_force_collection += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; }