git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10520 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2013-08-02 15:03:02 +00:00
parent 9adfd0cac1
commit 41bf441838
9 changed files with 164 additions and 164 deletions

View File

@ -330,8 +330,8 @@ int AtomVecAngleCuda::pack_exchange(int dim, double *buf)
int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_times time1,time2;
my_gettime(CLOCK_REALTIME,&time1);
double* buf_p=*buf_pointer;
for(int j=0;j<nsend_atoms;j++)
@ -378,7 +378,7 @@ int AtomVecAngleCuda::pack_exchange(int dim, double *buf)
(*buf_pointer)[j+1] = nextra;
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
@ -423,8 +423,8 @@ int AtomVecAngleCuda::unpack_exchange(double *buf)
int m = nsend_atoms*NCUDAEXCHANGE + 1;
nlocal+=naccept;
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_times time1,time2;
my_gettime(CLOCK_REALTIME,&time1);
for(int j=0;j<nsend_atoms;j++)
{
@ -462,7 +462,7 @@ int AtomVecAngleCuda::unpack_exchange(double *buf)
m+=static_cast <int> (buf[j+1]);
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

View File

@ -331,8 +331,8 @@ int AtomVecFullCuda::pack_exchange(int dim, double *buf)
int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_times time1,time2;
my_gettime(CLOCK_REALTIME,&time1);
double* buf_p=*buf_pointer;
for(int j=0;j<nsend_atoms;j++)
@ -401,7 +401,7 @@ int AtomVecFullCuda::pack_exchange(int dim, double *buf)
(*buf_pointer)[j+1] = nextra;
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
@ -446,8 +446,8 @@ int AtomVecFullCuda::unpack_exchange(double *buf)
int m = nsend_atoms*NCUDAEXCHANGE + 1;
nlocal+=naccept;
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_times time1,time2;
my_gettime(CLOCK_REALTIME,&time1);
for(int j=0;j<nsend_atoms;j++)
{
@ -503,7 +503,7 @@ int AtomVecFullCuda::unpack_exchange(double *buf)
m+=static_cast <int> (buf[j+1]);
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

View File

@ -175,7 +175,7 @@ void CommCuda::forward_comm_cuda()
static int count=0;
static double kerneltime=0.0;
static double copytime=0.0;
timespec time1,time2,time3;
my_times time1,time2,time3;
int n;
MPI_Request request;
@ -214,13 +214,13 @@ void CommCuda::forward_comm_cuda()
size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_FLOAT)/sizeof(double);
else
size_forward_recv_now=size_forward_recv[iswap];
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
recvproc[iswap],0,world,&request);
n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
@ -229,7 +229,7 @@ clock_gettime(CLOCK_REALTIME,&time2);
MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
MPI_Wait(&request,&status);
clock_gettime(CLOCK_REALTIME,&time3);
my_gettime(CLOCK_REALTIME,&time3);
cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
@ -307,7 +307,7 @@ void CommCuda::forward_comm_pack_cuda()
static int count=0;
static double kerneltime=0.0;
static double copytime=0.0;
timespec time1,time2,time3;
my_times time1,time2,time3;
int n; // initialize comm buffers & exchange memory
MPI_Request request;
@ -335,12 +335,12 @@ void CommCuda::forward_comm_pack_cuda()
{
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
// n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]);
n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
@ -348,11 +348,11 @@ clock_gettime(CLOCK_REALTIME,&time2);
}
else if (ghost_velocity)
{
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
// n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void*) &buf_send[iswap*maxsend],pbc[iswap],pbc_flag[iswap]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
if((sizeof(X_FLOAT)!=sizeof(double)) && n) //some complicated way to safe some transfer size if single precision is used
n=(n+1)*sizeof(X_FLOAT)/sizeof(double);
@ -410,7 +410,7 @@ void CommCuda::forward_comm_transfer_cuda()
static int count=0;
static double kerneltime=0.0;
static double copytime=0.0;
timespec time1,time2,time3;
my_times time1,time2,time3;
int n;
MPI_Request request;
MPI_Status status;
@ -453,27 +453,27 @@ void CommCuda::forward_comm_transfer_cuda()
//printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
CudaWrapper_DownloadCudaDataAsync((void*) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double),2);
//MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
CudaWrapper_SyncStream(2);
//printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_forward_download+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
MPI_Wait(&request,&status);
//printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2);
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
CudaWrapper_SyncStream(2);
//printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
//memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double));
//printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now*sizeof(double),cuda->shared_data.comm.send_size[iswap]*sizeof(double));
clock_gettime(CLOCK_REALTIME,&time3);
my_gettime(CLOCK_REALTIME,&time3);
cuda->shared_data.cuda_timings.comm_forward_upload+=
time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
clock_gettime(CLOCK_REALTIME,&time3);
my_gettime(CLOCK_REALTIME,&time3);
cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
}
@ -486,17 +486,17 @@ cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
else
size_forward_recv_now=size_forward_recv[iswap];
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
recvproc[iswap],0,world,&request);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
MPI_Wait(&request,&status);
clock_gettime(CLOCK_REALTIME,&time3);
my_gettime(CLOCK_REALTIME,&time3);
cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
@ -548,7 +548,7 @@ void CommCuda::forward_comm_unpack_cuda()
static int count=0;
static double kerneltime=0.0;
static double copytime=0.0;
timespec time1,time2,time3;
my_times time1,time2,time3;
int n;
MPI_Request request;
MPI_Status status;
@ -762,7 +762,7 @@ void CommCuda::exchange_cuda()
MPI_Request request;
MPI_Status status;
AtomVec *avec = atom->avec;
timespec time1,time2,time3;
my_times time1,time2,time3;
// clear global->local map for owned and ghost atoms
// b/c atoms migrate to new procs in exchange() and
@ -805,7 +805,7 @@ void CommCuda::exchange_cuda()
// if 2 procs in dimension, single send/recv
// if more than 2 procs in dimension, send/recv to both neighbors
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
if (procgrid[dim] == 1) {
nrecv = nsend;
@ -841,7 +841,7 @@ void CommCuda::exchange_cuda()
//printf("nsend: %i nrecv: %i\n",nsend,nrecv);
// check incoming atoms to see if they are in my box
// if so, add to my list
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_exchange_mpi+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
@ -902,7 +902,7 @@ void CommCuda::borders_cuda()
MPI_Request request;
MPI_Status status;
AtomVec *avec = atom->avec;
timespec time1,time2,time3;
my_times time1,time2,time3;
// clear old ghosts
@ -966,7 +966,7 @@ void CommCuda::borders_cuda()
// put incoming ghosts at end of my atom arrays
// if swapping with self, simply copy, no messages
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
if (sendproc[iswap] != me) {
MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
@ -982,7 +982,7 @@ clock_gettime(CLOCK_REALTIME,&time1);
buf = buf_send;
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_border_mpi+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
@ -1037,7 +1037,7 @@ void CommCuda::borders_cuda_overlap_forward_comm()
MPI_Request request;
MPI_Status status;
AtomVec *avec = atom->avec;
timespec time1,time2,time3;
my_times time1,time2,time3;
// clear old ghosts
@ -1102,7 +1102,7 @@ void CommCuda::borders_cuda_overlap_forward_comm()
// put incoming ghosts at end of my atom arrays
// if swapping with self, simply copy, no messages
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
if (sendproc[iswap] != me) {
MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
&nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
@ -1118,7 +1118,7 @@ clock_gettime(CLOCK_REALTIME,&time1);
buf = buf_send;
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
cuda->shared_data.cuda_timings.comm_border_mpi+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;

View File

@ -629,12 +629,12 @@ void Cuda::evsetup_eatom_vatom(int eflag_atom, int vflag_atom)
void Cuda::uploadAll()
{
MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
if(atom->nmax != shared_data.atom.nmax) checkResize();
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
cu_x ->upload();
cu_v ->upload();
cu_f ->upload();
@ -663,7 +663,7 @@ void Cuda::uploadAll()
if(cu_vatom) cu_vatom->upload();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
CUDA_IF_BINNING(Cuda_Binning(& shared_data);)
@ -675,13 +675,13 @@ void Cuda::uploadAll()
void Cuda::downloadAll()
{
MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
if(atom->nmax != shared_data.atom.nmax) checkResize();
CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
cu_x ->download();
cu_v ->download();
cu_f ->download();
@ -713,7 +713,7 @@ void Cuda::downloadAll()
if(cu_vatom) cu_vatom->download();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
}
@ -721,12 +721,12 @@ void Cuda::downloadAll()
void Cuda::upload(int datamask)
{
MYDBG(printf("# CUDA: Cuda::upload() ... start\n");)
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
if(atom->nmax != shared_data.atom.nmax) checkResize();
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
if(X_MASK & datamask) cu_x ->upload();
if(V_MASK & datamask) cu_v ->upload();
if(F_MASK & datamask) cu_f ->upload();
@ -766,7 +766,7 @@ void Cuda::upload(int datamask)
if(cu_vatom) cu_vatom->upload();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
uploadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
MYDBG(printf("# CUDA: Cuda::upload() ... end\n");)
}
@ -774,13 +774,13 @@ void Cuda::upload(int datamask)
void Cuda::download(int datamask)
{
MYDBG(printf("# CUDA: Cuda::download() ... start\n");)
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
if(atom->nmax != shared_data.atom.nmax) checkResize();
CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
if(X_MASK & datamask) cu_x ->download();
if(V_MASK & datamask) cu_v ->download();
if(F_MASK & datamask) cu_f ->download();
@ -820,7 +820,7 @@ void Cuda::download(int datamask)
if(cu_vatom) cu_vatom->download();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
downloadtime += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
MYDBG(printf("# CUDA: Cuda::download() ... end\n");)
}

View File

@ -281,9 +281,9 @@ void cCudaData<host_type, dev_type, mode>
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -298,9 +298,9 @@ void cCudaData<host_type, dev_type, mode>
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -311,7 +311,7 @@ void cCudaData<host_type, dev_type, mode>
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
@ -320,7 +320,7 @@ void cCudaData<host_type, dev_type, mode>
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -330,7 +330,7 @@ void cCudaData<host_type, dev_type, mode>
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
@ -339,7 +339,7 @@ void cCudaData<host_type, dev_type, mode>
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -348,7 +348,7 @@ void cCudaData<host_type, dev_type, mode>
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
@ -358,7 +358,7 @@ void cCudaData<host_type, dev_type, mode>
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -368,7 +368,7 @@ void cCudaData<host_type, dev_type, mode>
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
@ -378,7 +378,7 @@ void cCudaData<host_type, dev_type, mode>
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
@ -459,9 +459,9 @@ void cCudaData<host_type, dev_type, mode>
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -476,9 +476,9 @@ void cCudaData<host_type, dev_type, mode>
else
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -489,7 +489,7 @@ void cCudaData<host_type, dev_type, mode>
case xy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
@ -498,7 +498,7 @@ void cCudaData<host_type, dev_type, mode>
temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -508,7 +508,7 @@ void cCudaData<host_type, dev_type, mode>
case yx:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
@ -517,7 +517,7 @@ void cCudaData<host_type, dev_type, mode>
temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -526,7 +526,7 @@ void cCudaData<host_type, dev_type, mode>
case xyz:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
@ -536,7 +536,7 @@ void cCudaData<host_type, dev_type, mode>
temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -546,7 +546,7 @@ void cCudaData<host_type, dev_type, mode>
case xzy:
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
@ -556,7 +556,7 @@ void cCudaData<host_type, dev_type, mode>
temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufUploadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
@ -585,9 +585,9 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
@ -602,9 +602,9 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
}
@ -615,7 +615,7 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
{
dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
@ -624,7 +624,7 @@ void cCudaData<host_type, dev_type, mode>
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
@ -634,7 +634,7 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
@ -643,7 +643,7 @@ void cCudaData<host_type, dev_type, mode>
reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
@ -653,7 +653,7 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
{
@ -663,7 +663,7 @@ void cCudaData<host_type, dev_type, mode>
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;
@ -673,7 +673,7 @@ void cCudaData<host_type, dev_type, mode>
{
CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
my_gettime(CLOCK_REALTIME,&time1);
for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
{
@ -683,7 +683,7 @@ void cCudaData<host_type, dev_type, mode>
reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
}
}
clock_gettime(CLOCK_REALTIME,&time2);
my_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_AddCPUBufDownloadTime(
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
break;

View File

@ -88,8 +88,8 @@ void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan
{
#ifdef FFT_CUFFT
plan->iterate++;
timespec starttime,starttime2;
timespec endtime,endtime2;
my_times starttime,starttime2;
my_times endtime,endtime2;
int i,total,length,offset,num;
double norm;
@ -103,7 +103,7 @@ void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan
if(nprocs>1)
{
if(plan->init)
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
if (plan->pre_plan) {
if (plan->pre_target == 0) copy = out;
else copy = plan->copy;

View File

@ -670,8 +670,8 @@ void FixShakeCuda::pre_neighbor()
void FixShakeCuda::post_force(int vflag)
{
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
if(cuda->finished_setup && neighbor_step) {
@ -715,7 +715,7 @@ void FixShakeCuda::post_force(int vflag)
// loop over clusters
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
if(cuda->finished_setup) {
cu_virial->upload();
@ -739,7 +739,7 @@ void FixShakeCuda::post_force(int vflag)
if((not cuda->finished_setup)) cuda->cu_f->upload();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
if(cuda->finished_setup)
time_postforce += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);

View File

@ -691,10 +691,10 @@ void PPPMCuda::compute(int eflag, int vflag)
cuda_shared_atom* cu_atom = & cuda->shared_data.atom;
int i;
timespec starttime;
timespec endtime;
timespec starttotal;
timespec endtotal;
my_times starttime;
my_times endtime;
my_times starttotal;
my_times endtotal;
// convert atoms from box to lamda coords
if (triclinic == 0) boxlo = domain->boxlo;
@ -726,23 +726,23 @@ void PPPMCuda::compute(int eflag, int vflag)
cu_virial->memset_device(0);
}
if(eflag) cu_energy->memset_device(0);
clock_gettime(CLOCK_REALTIME,&starttotal);
my_gettime(CLOCK_REALTIME,&starttotal);
// find grid points for all my particles
// map my particle charge onto my local 3d density grid
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
particle_map();
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
cuda->shared_data.cuda_timings.pppm_particle_map+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
//cu_part2grid->download();
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
make_rho();
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
cuda->shared_data.cuda_timings.pppm_make_rho+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
// all procs communicate density values from their ghost cells
@ -751,7 +751,7 @@ void PPPMCuda::compute(int eflag, int vflag)
int nprocs=comm->nprocs;
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
if(nprocs>1)
{
@ -765,16 +765,16 @@ void PPPMCuda::compute(int eflag, int vflag)
#endif
}
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
cuda->shared_data.cuda_timings.pppm_brick2fft+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
// compute potential gradient on my FFT grid and
// portion of e_long on this proc's FFT grid
// return gradients (electric fields) in 3d brick decomposition
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
poisson(eflag,vflag);
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
cuda->shared_data.cuda_timings.pppm_poisson+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
// all procs communicate E-field values to fill ghost cells
@ -785,14 +785,14 @@ void PPPMCuda::compute(int eflag, int vflag)
// calculate the force on my particles
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
fieldforce();
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
cuda->shared_data.cuda_timings.pppm_fieldforce+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
// sum energy across procs and add in volume-dependent term
clock_gettime(CLOCK_REALTIME,&endtotal);
my_gettime(CLOCK_REALTIME,&endtotal);
cuda->shared_data.cuda_timings.pppm_compute+=(endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000);
if (eflag) {
@ -1303,14 +1303,14 @@ void PPPMCuda::poisson(int eflag, int vflag)
return;
#endif
#ifdef FFT_CUFFT
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
fft1c->compute(density_fft,work1,1);
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
@ -1341,9 +1341,9 @@ void PPPMCuda::poisson(int eflag, int vflag)
poisson_xgrad(nx_pppm,ny_pppm,nz_pppm);
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
fft2c->compute(work2,work2,-1);
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
poisson_vdx_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);
@ -1353,9 +1353,9 @@ void PPPMCuda::poisson(int eflag, int vflag)
poisson_ygrad(nx_pppm,ny_pppm,nz_pppm);
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
fft2c->compute(work2,work2,-1);
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
poisson_vdy_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);
@ -1364,9 +1364,9 @@ void PPPMCuda::poisson(int eflag, int vflag)
poisson_zgrad(nx_pppm,ny_pppm,nz_pppm);
clock_gettime(CLOCK_REALTIME,&starttime);
my_gettime(CLOCK_REALTIME,&starttime);
fft2c->compute(work2,work2,-1);
clock_gettime(CLOCK_REALTIME,&endtime);
my_gettime(CLOCK_REALTIME,&endtime);
poissontime+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
poisson_vdz_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,nx_pppm,ny_pppm,nz_pppm);

View File

@ -173,8 +173,8 @@ void VerletCuda::setup()
if(elist_atom || vlist_atom) cuda->checkResize();
int test_BpA_vs_TpA = true;
timespec starttime;
timespec endtime;
my_times starttime;
my_times endtime;
#ifdef NO_PREC_TIMING
double startsec, endsec;
#endif
@ -201,7 +201,7 @@ void VerletCuda::setup()
#ifdef NO_PREC_TIMING
startsec = 1.0 * clock() / CLOCKS_PER_SEC;
#endif
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
for(int i = 0; i < StyleLoops; i++) {
Cuda_Pair_GenerateXType(&cuda->shared_data);
@ -216,7 +216,7 @@ void VerletCuda::setup()
CudaWrapper_Sync();
}
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
double TpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
#ifdef NO_PREC_TIMING
@ -240,7 +240,7 @@ void VerletCuda::setup()
force->pair->compute(eflag, vflag);
CudaWrapper_Sync();
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
#ifdef NO_PREC_TIMING
startsec = 1.0 * clock() / CLOCKS_PER_SEC;
#endif
@ -258,7 +258,7 @@ void VerletCuda::setup()
CudaWrapper_Sync();
}
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
double BpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
#ifdef NO_PREC_TIMING
endsec = 1.0 * clock() / CLOCKS_PER_SEC;
@ -586,16 +586,16 @@ void VerletCuda::run(int n)
int testatom = cuda->testatom; //48267;
timespec starttime;
timespec endtime;
timespec starttotal;
timespec endtotal;
my_times starttime;
my_times endtime;
my_times starttotal;
my_times endtotal;
cuda->setTimingsZero();
static double testtime = 0.0;
// clock_gettime(CLOCK_REALTIME,&starttime);
// clock_gettime(CLOCK_REALTIME,&endtime);
// my_gettime(CLOCK_REALTIME,&starttime);
// my_gettime(CLOCK_REALTIME,&endtime);
// testtime+=endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
// printf("Time: %lf\n",testtime);*/
@ -692,13 +692,13 @@ void VerletCuda::run(int n)
//overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
//build communication buffers
// printf("Pre forward_comm(1)\n");
clock_gettime(CLOCK_REALTIME, &starttotal);
my_gettime(CLOCK_REALTIME, &starttotal);
cuda->shared_data.atom.reneigh_flag = 0;
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
timer->stamp();
comm->forward_comm(1);
timer->stamp(TIME_COMM);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.comm_forward_total +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
@ -721,9 +721,9 @@ void VerletCuda::run(int n)
//CudaWrapper_Sync();
//download comm buffers from GPU, perform MPI communication and upload buffers again
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
comm->forward_comm(2);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.comm_forward_total +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
timer->stamp(TIME_COMM);
@ -733,9 +733,9 @@ void VerletCuda::run(int n)
timer->stamp(TIME_PAIR);
//unpack communication buffers
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
comm->forward_comm(3);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.comm_forward_total +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
@ -745,9 +745,9 @@ void VerletCuda::run(int n)
endtotal.tv_sec - starttotal.tv_sec + 1.0 * (endtotal.tv_nsec - starttotal.tv_nsec) / 1000000000;
} else {
//perform standard forward communication
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
comm->forward_comm();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.comm_forward_total +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
timer->stamp(TIME_COMM);
@ -791,13 +791,13 @@ void VerletCuda::run(int n)
MYDBG(printf("# CUDA VerletCuda::iterate: neighbor exchange\n");)
//perform exchange of local atoms
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
comm->exchange();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
//special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
if(comm->nprocs > 1) {
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
if(atom->special)
cuda->cu_special->upload();
@ -805,7 +805,7 @@ void VerletCuda::run(int n)
if(atom->nspecial)
cuda->cu_nspecial->upload();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.test1 +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
}
@ -821,13 +821,13 @@ void VerletCuda::run(int n)
MYDBG(printf("# CUDA VerletCuda::iterate: neighbor borders\n");)
//generate ghost atom lists, and transfer ghost atom data
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
comm->borders();
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.comm_border_total +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
//atom index maps are generated on CPU, and need to be transfered to GPU if they are used
if(cuda->cu_map_array)
cuda->cu_map_array->upload();
@ -841,7 +841,7 @@ void VerletCuda::run(int n)
MYDBG(printf("# CUDA VerletCuda::iterate: neighbor build\n");)
timer->stamp(TIME_COMM);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.test2 +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
@ -888,8 +888,8 @@ void VerletCuda::run(int n)
//regenerate data layout for force computations, its actually only needed for the ghost atoms
cuda->shared_data.comm.comm_phase = 2;
timespec atime1, atime2;
clock_gettime(CLOCK_REALTIME, &atime1);
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
Cuda_Pair_GenerateXType(&cuda->shared_data);
@ -899,7 +899,7 @@ void VerletCuda::run(int n)
if(cuda->cu_omega_rmass)
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
clock_gettime(CLOCK_REALTIME, &atime2);
my_gettime(CLOCK_REALTIME, &atime2);
cuda->shared_data.cuda_timings.pair_xtype_conversion +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
force->pair->compute(eflag, vflag);
@ -909,8 +909,8 @@ void VerletCuda::run(int n)
if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
else {
//regenerate data layout for force computations, its actually only needed for the ghost atoms
timespec atime1, atime2;
clock_gettime(CLOCK_REALTIME, &atime1);
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
Cuda_Pair_GenerateXType(&cuda->shared_data);
@ -920,7 +920,7 @@ void VerletCuda::run(int n)
if(cuda->cu_omega_rmass)
Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
clock_gettime(CLOCK_REALTIME, &atime2);
my_gettime(CLOCK_REALTIME, &atime2);
cuda->shared_data.cuda_timings.pair_xtype_conversion +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
}
@ -967,7 +967,7 @@ void VerletCuda::run(int n)
//collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && (not(force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
cuda->cu_f->uploadAsync(2);
test_atom(testatom, "post molecular force");
@ -989,7 +989,7 @@ void VerletCuda::run(int n)
timer->stamp(TIME_PAIR);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.pair_force_collection +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
}
@ -1020,7 +1020,7 @@ void VerletCuda::run(int n)
if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && ((force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
cuda->cu_f->uploadAsync(2);
clock_gettime(CLOCK_REALTIME, &starttime);
my_gettime(CLOCK_REALTIME, &starttime);
if(eflag) cuda->cu_eng_vdwl->upload();
@ -1038,7 +1038,7 @@ void VerletCuda::run(int n)
timer->stamp(TIME_PAIR);
clock_gettime(CLOCK_REALTIME, &endtime);
my_gettime(CLOCK_REALTIME, &endtime);
cuda->shared_data.cuda_timings.pair_force_collection +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
}