git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10519 f3b2605a-c512-4ea7-a41b-209d697bcdaa
parent bd568f0df4
commit 9adfd0cac1

@@ -134,7 +134,7 @@ template <const unsigned int data_mask>
 int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
 {

-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -171,14 +171,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
   if(sdata->atom.nlocal > 0) {
     cudaMemset(sdata->flag, 0, sizeof(int));

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
     Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
         , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_pack +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

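Every hunk in this diff makes the same substitution: the `timespec`/`clock_gettime` pair around a kernel launch (always followed by `cudaThreadSynchronize()` so the stop timestamp is taken after the kernel has finished) becomes the portable `my_times`/`my_gettime` pair, and the elapsed wall time is accumulated into one of the `sdata->cuda_timings` counters. A minimal sketch of that bookkeeping, assuming the `my_times`/`my_gettime` definitions from the cuda_precision.h hunk near the end of this diff; the helper name `elapsed_seconds` is illustrative and not part of the patch:

    // Illustrative helper (not in the patch): the elapsed-time expression that
    // every hunk repeats, written once. Assumes my_times/my_gettime from
    // cuda_precision.h, i.e. timespec/clock_gettime unless NO_PREC_TIMING is set.
    static inline double elapsed_seconds(const my_times &start, const my_times &stop)
    {
      return stop.tv_sec - start.tv_sec
             + 1.0 * (stop.tv_nsec - start.tv_nsec) / 1000000000;
    }

    // Typical use, mirroring the hunk above:
    //   my_times time1, time2;
    //   my_gettime(CLOCK_REALTIME, &time1);
    //   SomeKernel<<<grid, threads>>>(...);
    //   cudaThreadSynchronize();
    //   my_gettime(CLOCK_REALTIME, &time2);
    //   sdata->cuda_timings.comm_forward_kernel_pack += elapsed_seconds(time1, time2);
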
@@ -188,7 +188,7 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
     cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
     //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_download +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -207,7 +207,7 @@ template <const unsigned int data_mask>
 int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
 {
   MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -247,13 +247,13 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in

   if(sdata->atom.nlocal > 0) {

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");

     Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_self +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -267,7 +267,7 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in
 template <const unsigned int data_mask>
 void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
 {
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -286,19 +286,19 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     if(not sdata->overlap_comm || iswap < 0)
       cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_upload +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
     void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
     Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_kernel_unpack +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -325,14 +325,14 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v
   dim3 threads(layout.z, 1, 1);
   dim3 grid(layout.x, layout.y, 1);

-  timespec time1, time2;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_times time1, time2;
+  my_gettime(CLOCK_REALTIME, &time1);

   Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
   cudaThreadSynchronize();
   CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");

-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.comm_exchange_kernel_pack +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -344,7 +344,7 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v

   CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");

-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);
   sdata->cuda_timings.comm_exchange_download +=
     time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -375,20 +375,20 @@ int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_
   dim3 threads(layout.z, 1, 1);
   dim3 grid(layout.x, layout.y, 1);

-  timespec time1, time2;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_times time1, time2;
+  my_gettime(CLOCK_REALTIME, &time1);

   Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
   cudaThreadSynchronize();
   CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");

-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.comm_exchange_kernel_pack +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

   cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);

-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);
   sdata->cuda_timings.comm_exchange_download +=
     time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -419,19 +419,19 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    timespec time1, time2;
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_times time1, time2;
+    my_gettime(CLOCK_REALTIME, &time1);

     cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_exchange_upload +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

     Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_exchange_kernel_unpack +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -448,8 +448,8 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu
 template <const unsigned int data_mask>
 int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -457,7 +457,7 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -489,20 +489,20 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    timespec time1, time2;
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_times time1, time2;
+    my_gettime(CLOCK_REALTIME, &time1);

     Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_border_kernel_pack +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

     cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
     CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_border_download +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -548,13 +548,13 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    timespec time1, time2;
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_times time1, time2;
+    my_gettime(CLOCK_REALTIME, &time1);

     Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_border_kernel_self +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -569,8 +569,8 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
 template <const unsigned int data_mask>
 int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -578,7 +578,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -594,20 +594,20 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    timespec time1, time2;
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_times time1, time2;
+    my_gettime(CLOCK_REALTIME, &time1);

     cudaMemset((int*)(sdata->flag), 0, sizeof(int));
     cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_border_upload +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

     Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_border_kernel_unpack +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -73,7 +73,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata)
 int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
 {

-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -109,14 +109,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
   if(sdata->atom.nlocal > 0) {
     cudaMemset(sdata->flag, 0, sizeof(int));

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
     Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
         , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_pack +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -126,7 +126,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
     cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
     //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_download +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -143,7 +143,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
 int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
 {

-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -179,14 +179,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
   if(sdata->atom.nlocal > 0) {
     cudaMemset(sdata->flag, 0, sizeof(int));

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
     Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
         , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_pack +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -196,7 +196,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
     cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
     //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_download +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -213,7 +213,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
 int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
 {
   MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -252,12 +252,12 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f

   if(sdata->atom.nlocal > 0) {

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_self +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -270,7 +270,7 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f
 int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
 {
   MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -309,12 +309,12 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in

   if(sdata->atom.nlocal > 0) {

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_kernel_self +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -326,7 +326,7 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in

 void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
 {
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -344,19 +344,19 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     if(not sdata->overlap_comm || iswap < 0)
       cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_upload +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
     void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
     Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_kernel_unpack +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -367,7 +367,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b

 void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
 {
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -385,19 +385,19 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
   dim3 grid(layout.x, layout.y, 1);

   if(sdata->atom.nlocal > 0) {
-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);

     if(not sdata->overlap_comm || iswap < 0)
       cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);

-    clock_gettime(CLOCK_REALTIME, &time2);
+    my_gettime(CLOCK_REALTIME, &time2);
     sdata->cuda_timings.comm_forward_upload +=
       time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
     void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
     Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
     cudaThreadSynchronize();

-    clock_gettime(CLOCK_REALTIME, &time1);
+    my_gettime(CLOCK_REALTIME, &time1);
     sdata->cuda_timings.comm_forward_kernel_unpack +=
       time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

@@ -489,7 +489,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap,
 int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
 {
   MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
-  timespec time1, time2;
+  my_times time1, time2;

   if(sdata->atom.update_nmax)
     Cuda_CommCuda_UpdateNmax(sdata);
@@ -517,7 +517,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in

   cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);

   if(style == 1)
     Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
@@ -525,7 +525,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in
     Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);

   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.comm_border_kernel_buildlist +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -720,7 +720,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =

   CUT_CHECK_ERROR("Cuda_Pair: init failed");
 }
-timespec startpairtime, endpairtime;
+my_times startpairtime, endpairtime;
 //Function which is called prior to kernel invocation, determins grid, Binds Textures, updates constant memory if necessary
 void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
 {
@@ -785,7 +785,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis

   if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;

-  clock_gettime(CLOCK_REALTIME, &startpairtime);
+  my_gettime(CLOCK_REALTIME, &startpairtime);

   MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
 }
@@ -795,7 +795,7 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh
 {
   if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
     cudaThreadSynchronize();
-    clock_gettime(CLOCK_REALTIME, &endpairtime);
+    my_gettime(CLOCK_REALTIME, &endpairtime);
     sdata->cuda_timings.pair_kernel +=
       endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
     CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
@@ -986,7 +986,7 @@ void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
 void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
 {
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &endpairtime);
+  my_gettime(CLOCK_REALTIME, &endpairtime);
   sdata->cuda_timings.pair_kernel +=
     endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
   CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
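The Cuda_Pair hunks above differ slightly from the rest: the timed interval spans two host functions, so the start and stop values live in file-scope variables (`startpairtime`, `endpairtime`) that are set in the pre-kernel call and read in the post-kernel/collect-forces call. A reduced sketch of that structure, with invented function names and assuming the project's `cuda_shared_data` and timing wrappers:

    // Reduced sketch (function names invented): timing that starts in one call
    // and is accumulated in another, as in Cuda_Pair_PreKernel_AllStyles /
    // Cuda_Pair_CollectForces above.
    static my_times startpairtime, endpairtime;   // file scope, shared by both calls

    void pre_kernel_sketch()
    {
      my_gettime(CLOCK_REALTIME, &startpairtime); // taken just before the pair kernel launch
    }

    void post_kernel_sketch(cuda_shared_data* sdata)
    {
      cudaThreadSynchronize();                    // make sure the kernel really finished
      my_gettime(CLOCK_REALTIME, &endpairtime);
      sdata->cuda_timings.pair_kernel +=
        endpairtime.tv_sec - startpairtime.tv_sec
        + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
    }
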
@@ -263,12 +263,15 @@ struct V_FLOAT4 {
 #endif

 #ifdef NO_PREC_TIMING
-struct timespec_2 {
+struct my_times {
   unsigned int tv_sec;
   unsigned int tv_nsec;
 };

-#define timespec timespec_2
-#define clock_gettime(a,b)
+#define my_gettime(a,b)
+#else
+#define my_times timespec
+#define my_gettime(a,b) clock_gettime(a,b)
 #endif

 #endif /*CUDA_PRECISION_H_*/
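The cuda_precision.h hunk above is the core of the change: instead of redefining the system names `timespec` and `clock_gettime` when `NO_PREC_TIMING` is set, the header now always provides `my_times`/`my_gettime`, which expand to the real POSIX calls in the normal build and to a dummy struct plus a no-op otherwise. A standalone sketch of the default (precise-timing) path that compiles outside the project; only the two `#define`s are taken from the hunk, the rest is illustrative:

    // Standalone sketch of the default path (assumes POSIX clock_gettime is available).
    // With NO_PREC_TIMING the same two names would instead map to the dummy struct
    // and an empty macro, and the timing code would simply measure nothing.
    #include <cstdio>
    #include <ctime>

    #define my_times timespec
    #define my_gettime(a,b) clock_gettime(a,b)

    int main()
    {
      my_times t1, t2;
      my_gettime(CLOCK_REALTIME, &t1);
      // ... work being timed; in the patch this is a kernel launch plus cudaThreadSynchronize() ...
      my_gettime(CLOCK_REALTIME, &t2);
      double elapsed = t2.tv_sec - t1.tv_sec + 1.0 * (t2.tv_nsec - t1.tv_nsec) / 1000000000;
      printf("elapsed: %g s\n", elapsed);
      return 0;
    }
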
@@ -175,10 +175,10 @@ void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes
 {
   MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
   cudaThreadSynchronize();
-  timespec time1, time2;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_times time1, time2;
+  my_gettime(CLOCK_REALTIME, &time1);
   CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   CudaWrapper_total_upload_time +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
 }
@@ -193,10 +193,10 @@ void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbyt
 {
   MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
   cudaThreadSynchronize();
-  timespec time1, time2;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_times time1, time2;
+  my_gettime(CLOCK_REALTIME, &time1);
   CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   CudaWrapper_total_download_time +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
 }
@@ -81,8 +81,8 @@ void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)

 void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -90,7 +90,7 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -145,8 +145,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro

 void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -154,7 +154,7 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -171,8 +171,8 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac
 }
 void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -180,7 +180,7 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) /
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -198,8 +198,8 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) /

 void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
 {
-  timespec atime1, atime2;
-  clock_gettime(CLOCK_REALTIME, &atime1);
+  my_times atime1, atime2;
+  my_gettime(CLOCK_REALTIME, &atime1);

   if(sdata->atom.update_nmax)
     Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -207,7 +207,7 @@ void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) /
   if(sdata->atom.update_nlocal)
     cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

-  clock_gettime(CLOCK_REALTIME, &atime2);
+  my_gettime(CLOCK_REALTIME, &atime2);
   sdata->cuda_timings.test1 +=
     atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

@@ -98,15 +98,15 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
   dim3 threads(layout.z, 1, 1);
   dim3 grid(layout.x, layout.y, 1);

-  timespec starttime, endtime;
-  clock_gettime(CLOCK_REALTIME, &starttime);
+  my_times starttime, endtime;
+  my_gettime(CLOCK_REALTIME, &starttime);

   cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));

   Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
   cudaThreadSynchronize();

-  clock_gettime(CLOCK_REALTIME, &endtime);
+  my_gettime(CLOCK_REALTIME, &endtime);
   sdata->cuda_timings.neigh_bin +=
     endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

@@ -228,8 +228,8 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
   unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
   MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
   //shared_size=2056;
-  timespec starttime, endtime;
-  clock_gettime(CLOCK_REALTIME, &starttime);
+  my_times starttime, endtime;
+  my_gettime(CLOCK_REALTIME, &starttime);
   //for(int i=0;i<100;i++)
   {
     if(sdata->overlap_comm)
@@ -250,7 +250,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn

     cudaThreadSynchronize();
     CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
-    clock_gettime(CLOCK_REALTIME, &endtime);
+    my_gettime(CLOCK_REALTIME, &endtime);
     sdata->cuda_timings.neigh_build +=
       endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
     //dim3 threads,grid;
@@ -258,7 +258,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn

   if(buffer[0] >= 0 && true && sdata->atom.molecular) {
     //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
-    clock_gettime(CLOCK_REALTIME, &starttime);
+    my_gettime(CLOCK_REALTIME, &starttime);
     int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
     threads.x = layout.z;
     threads.y = 1;
@@ -269,7 +269,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
     FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
     cudaThreadSynchronize();
     CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
-    clock_gettime(CLOCK_REALTIME, &endtime);
+    my_gettime(CLOCK_REALTIME, &endtime);
     sdata->cuda_timings.neigh_special +=
       endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
   }
@@ -100,16 +100,16 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,
   threads2.y = 1;
   threads2.z = 1;

-  timespec time1, time2;
+  my_times time1, time2;

   //pre-calculate all neighbordistances and zeta_ij
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);
   Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.test1 +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);

   //actual force calculation
   unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
@@ -130,7 +130,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,
     (eflag_atom, vflag_atom);
   }
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.test2 +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

@@ -111,20 +111,20 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
   threads2.y = 1;
   threads2.z = 1;

-  timespec time1, time2;
+  my_times time1, time2;

   //pre-calculate all neighbordistances and zeta_ij
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);
   Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>
   ();
   cudaThreadSynchronize();
   Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>>
   ();
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.test1 +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
-  clock_gettime(CLOCK_REALTIME, &time1);
+  my_gettime(CLOCK_REALTIME, &time1);

   //actual force calculation
   unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
@@ -145,7 +145,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
     (eflag_atom, vflag_atom);
   }
   cudaThreadSynchronize();
-  clock_gettime(CLOCK_REALTIME, &time2);
+  my_gettime(CLOCK_REALTIME, &time2);
   sdata->cuda_timings.test2 +=
     time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
