From 9adfd0cac12f020ed4ba6edaf9b843bd0f711b62 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 2 Aug 2013 15:02:54 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10519 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/cuda/atom_vec_cuda.cu | 80 +++++++++++++++++------------------ lib/cuda/comm_cuda.cu | 50 +++++++++++----------- lib/cuda/cuda_pair.cu | 8 ++-- lib/cuda/cuda_precision.h | 9 ++-- lib/cuda/cuda_wrapper.cu | 12 +++--- lib/cuda/fix_nh_cuda.cu | 24 +++++------ lib/cuda/neighbor.cu | 16 +++---- lib/cuda/pair_sw_cuda.cu | 10 ++--- lib/cuda/pair_tersoff_cuda.cu | 10 ++--- 9 files changed, 111 insertions(+), 108 deletions(-) diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu index 091fb7dbf1..1bed9b2089 100644 --- a/lib/cuda/atom_vec_cuda.cu +++ b/lib/cuda/atom_vec_cuda.cu @@ -134,7 +134,7 @@ template int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag) { - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); @@ -171,14 +171,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b if(sdata->atom.nlocal > 0) { cudaMemset(sdata->flag, 0, sizeof(int)); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer; Cuda_AtomVecCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -188,7 +188,7 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost); //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -207,7 +207,7 @@ template int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) { MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");) - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); @@ -247,13 +247,13 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed"); Cuda_AtomVecCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_self += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -267,7 +267,7 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in template void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap) { - timespec time1, time2; + 
my_times time1, time2; if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); @@ -286,19 +286,19 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); if(not sdata->overlap_comm || iswap < 0) cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer; Cuda_AtomVecCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_kernel_unpack += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -325,14 +325,14 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_exchange_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -344,7 +344,7 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed"); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_exchange_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -375,20 +375,20 @@ int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_ dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); Cuda_AtomVecCuda_PackExchange_Kernel <<< grid, threads, 0>>>(nsend, (int*) copylist); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed"); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_exchange_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_exchange_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -419,19 +419,19 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, 
&time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_exchange_upload += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_AtomVecCuda_UnpackExchange_Kernel <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_exchange_kernel_unpack += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -448,8 +448,8 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu template int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); @@ -457,7 +457,7 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; @@ -489,20 +489,20 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); Cuda_AtomVecCuda_PackBorder_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_border_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed"); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_border_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -548,13 +548,13 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); Cuda_AtomVecCuda_PackBorder_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_border_kernel_self += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -569,8 +569,8 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, template int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_AtomVecCuda_UpdateNmax(sdata); @@ -578,7 +578,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int 
first, voi if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; @@ -594,20 +594,20 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); cudaMemset((int*)(sdata->flag), 0, sizeof(int)); cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_border_upload += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; Cuda_AtomVecCuda_UnpackBorder_Kernel <<< grid, threads, 0>>>(n, first); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_border_kernel_unpack += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu index 8ca2d63cac..ec95a8bfeb 100644 --- a/lib/cuda/comm_cuda.cu +++ b/lib/cuda/comm_cuda.cu @@ -73,7 +73,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata) int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag) { - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -109,14 +109,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_ if(sdata->atom.nlocal > 0) { cudaMemset(sdata->flag, 0, sizeof(int)); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer; Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -126,7 +126,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_ cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -143,7 +143,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag) { - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -179,14 +179,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b if(sdata->atom.nlocal > 0) { cudaMemset(sdata->flag, 0, sizeof(int)); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); void* buf = sdata->overlap_comm ? 
sdata->comm.buf_send_dev[iswap] : sdata->buffer; Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_pack += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -196,7 +196,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_download += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -213,7 +213,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) { MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -252,12 +252,12 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_self += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -270,7 +270,7 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) { MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");) - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -309,12 +309,12 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_kernel_self += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; @@ -326,7 +326,7 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap) { - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -344,19 +344,19 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); if(not sdata->overlap_comm || iswap < 0) cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), 
cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer; Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_kernel_unpack += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -367,7 +367,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap) { - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -385,19 +385,19 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void dim3 grid(layout.x, layout.y, 1); if(sdata->atom.nlocal > 0) { - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); if(not sdata->overlap_comm || iswap < 0) cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_forward_upload += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer; Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); sdata->cuda_timings.comm_forward_kernel_unpack += time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; @@ -489,7 +489,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap) { MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");) - timespec time1, time2; + my_times time1, time2; if(sdata->atom.update_nmax) Cuda_CommCuda_UpdateNmax(sdata); @@ -517,7 +517,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in cudaMemset((int*)(sdata->buffer), 0, sizeof(int)); - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); if(style == 1) Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); @@ -525,7 +525,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.comm_border_kernel_buildlist += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; diff --git 
a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu index 9f9900a2d8..28ab269e48 100644 --- a/lib/cuda/cuda_pair.cu +++ b/lib/cuda/cuda_pair.cu @@ -720,7 +720,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = CUT_CHECK_ERROR("Cuda_Pair: init failed"); } -timespec startpairtime, endpairtime; +my_times startpairtime, endpairtime; //Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256) { @@ -785,7 +785,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3; - clock_gettime(CLOCK_REALTIME, &startpairtime); + my_gettime(CLOCK_REALTIME, &startpairtime); MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) } @@ -795,7 +795,7 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh { if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2)) cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &endpairtime); + my_gettime(CLOCK_REALTIME, &endpairtime); sdata->cuda_timings.pair_kernel += endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000; CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed"); @@ -986,7 +986,7 @@ void Cuda_Pair_BuildXHold(cuda_shared_data* sdata) void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag) { cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &endpairtime); + my_gettime(CLOCK_REALTIME, &endpairtime); sdata->cuda_timings.pair_kernel += endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000; CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed"); diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h index 2dc4ab5607..7582c41de1 100644 --- a/lib/cuda/cuda_precision.h +++ b/lib/cuda/cuda_precision.h @@ -263,12 +263,15 @@ struct V_FLOAT4 { #endif #ifdef NO_PREC_TIMING -struct timespec_2 { +struct my_times { unsigned int tv_sec; unsigned int tv_nsec; }; -#define timespec timespec_2 -#define clock_gettime(a,b) +#define my_gettime(a,b) +#else +#define my_times timespec +#define my_gettime(a,b) clock_gettime(a,b) #endif + #endif /*CUDA_PRECISION_H_*/ diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu index 50366a87da..c8bda6ecc3 100644 --- a/lib/cuda/cuda_wrapper.cu +++ b/lib/cuda/cuda_wrapper.cu @@ -175,10 +175,10 @@ void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes { MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);) cudaThreadSynchronize(); - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice)); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); CudaWrapper_total_upload_time += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; } @@ -193,10 +193,10 @@ void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbyt { 
MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);) cudaThreadSynchronize(); - timespec time1, time2; - clock_gettime(CLOCK_REALTIME, &time1); + my_times time1, time2; + my_gettime(CLOCK_REALTIME, &time1); CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost)); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); CudaWrapper_total_download_time += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; } diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu index 1a9d84061f..cda10bd588 100644 --- a/lib/cuda/fix_nh_cuda.cu +++ b/lib/cuda/fix_nh_cuda.cu @@ -81,8 +81,8 @@ void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_FixNHCuda_UpdateNmax(sdata); @@ -90,7 +90,7 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; @@ -145,8 +145,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_FixNHCuda_UpdateNmax(sdata); @@ -154,7 +154,7 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; @@ -171,8 +171,8 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac } void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_FixNHCuda_UpdateNmax(sdata); @@ -180,7 +180,7 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) / if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; @@ -198,8 +198,8 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) / void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp { - timespec atime1, atime2; - clock_gettime(CLOCK_REALTIME, &atime1); + my_times 
atime1, atime2; + my_gettime(CLOCK_REALTIME, &atime1); if(sdata->atom.update_nmax) Cuda_FixNHCuda_UpdateNmax(sdata); @@ -207,7 +207,7 @@ void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) / if(sdata->atom.update_nlocal) cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); - clock_gettime(CLOCK_REALTIME, &atime2); + my_gettime(CLOCK_REALTIME, &atime2); sdata->cuda_timings.test1 += atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu index ddcf6ddc09..53af1e93f2 100644 --- a/lib/cuda/neighbor.cu +++ b/lib/cuda/neighbor.cu @@ -98,15 +98,15 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) dim3 threads(layout.z, 1, 1); dim3 grid(layout.x, layout.y, 1); - timespec starttime, endtime; - clock_gettime(CLOCK_REALTIME, &starttime); + my_times starttime, endtime; + my_gettime(CLOCK_REALTIME, &starttime); cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax)); Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); sdata->cuda_timings.neigh_bin += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; @@ -228,8 +228,8 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x; MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);) //shared_size=2056; - timespec starttime, endtime; - clock_gettime(CLOCK_REALTIME, &starttime); + my_times starttime, endtime; + my_gettime(CLOCK_REALTIME, &starttime); //for(int i=0;i<100;i++) { if(sdata->overlap_comm) @@ -250,7 +250,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); sdata->cuda_timings.neigh_build += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; //dim3 threads,grid; @@ -258,7 +258,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn if(buffer[0] >= 0 && true && sdata->atom.molecular) { //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall); - clock_gettime(CLOCK_REALTIME, &starttime); + my_gettime(CLOCK_REALTIME, &starttime); int3 layout = getgrid(sdata->atom.nlocal, 0, 512); threads.x = layout.z; threads.y = 1; @@ -269,7 +269,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom); cudaThreadSynchronize(); CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed"); - clock_gettime(CLOCK_REALTIME, &endtime); + my_gettime(CLOCK_REALTIME, &endtime); sdata->cuda_timings.neigh_special += endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; } diff --git 
a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu index 491d4d666f..e96c558c68 100644 --- a/lib/cuda/pair_sw_cuda.cu +++ b/lib/cuda/pair_sw_cuda.cu @@ -100,16 +100,16 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, threads2.y = 1; threads2.z = 1; - timespec time1, time2; + my_times time1, time2; //pre-calculate all neighbordistances and zeta_ij - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>(); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.test1 += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); //actual force calculation unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure @@ -130,7 +130,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.test2 += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu index 0ae5e846a0..e138c62b57 100644 --- a/lib/cuda/pair_tersoff_cuda.cu +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -111,20 +111,20 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh threads2.y = 1; threads2.z = 1; - timespec time1, time2; + my_times time1, time2; //pre-calculate all neighbordistances and zeta_ij - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>> (); cudaThreadSynchronize(); Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>> (); cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.test1 += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; - clock_gettime(CLOCK_REALTIME, &time1); + my_gettime(CLOCK_REALTIME, &time1); //actual force calculation unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure @@ -145,7 +145,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh (eflag_atom, vflag_atom); } cudaThreadSynchronize(); - clock_gettime(CLOCK_REALTIME, &time2); + my_gettime(CLOCK_REALTIME, &time2); sdata->cuda_timings.test2 += time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;