commit 9adfd0cac1 (parent bd568f0df4)
Author: sjplimp
Date: 2013-08-02 15:02:54 +00:00

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10519 f3b2605a-c512-4ea7-a41b-209d697bcdaa

9 changed files with 111 additions and 108 deletions

View File

@@ -134,7 +134,7 @@ template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -171,14 +171,14 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -188,7 +188,7 @@ int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* b
cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -207,7 +207,7 @@ template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -247,13 +247,13 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_self +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -267,7 +267,7 @@ int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, in
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -286,19 +286,19 @@ void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -325,14 +325,14 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_exchange_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -344,7 +344,7 @@ int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, v
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_exchange_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -375,20 +375,20 @@ int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_exchange_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_exchange_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -419,19 +419,19 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_exchange_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_exchange_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -448,8 +448,8 @@ int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* bu
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -457,7 +457,7 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
@@ -489,20 +489,20 @@ int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, v
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_border_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -548,13 +548,13 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_kernel_self +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -569,8 +569,8 @@ int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap,
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
@@ -578,7 +578,7 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
@@ -594,20 +594,20 @@ int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, voi
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
cudaMemset((int*)(sdata->flag), 0, sizeof(int));
cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_border_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

View File

@@ -73,7 +73,7 @@ void Cuda_CommCuda_Init(cuda_shared_data* sdata)
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -109,14 +109,14 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -126,7 +126,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -143,7 +143,7 @@ int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -179,14 +179,14 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -196,7 +196,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -213,7 +213,7 @@ int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* b
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -252,12 +252,12 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_self +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -270,7 +270,7 @@ int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int f
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -309,12 +309,12 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_self +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
@@ -326,7 +326,7 @@ int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, in
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -344,19 +344,19 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -367,7 +367,7 @@ void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* b
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -385,19 +385,19 @@ void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
@@ -489,7 +489,7 @@ void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap,
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
{
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
-timespec time1, time2;
+my_times time1, time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
@@ -517,7 +517,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in
cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
if(style == 1)
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
@@ -525,7 +525,7 @@ int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int in
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_kernel_buildlist +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

View File

@@ -720,7 +720,7 @@ void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q =
CUT_CHECK_ERROR("Cuda_Pair: init failed");
}
-timespec startpairtime, endpairtime;
+my_times startpairtime, endpairtime;
//Function which is called prior to kernel invocation; determines grid, binds textures, updates constant memory if necessary
void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
{
@@ -785,7 +785,7 @@ void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlis
if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
-clock_gettime(CLOCK_REALTIME, &startpairtime);
+my_gettime(CLOCK_REALTIME, &startpairtime);
MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
}
@@ -795,7 +795,7 @@ void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sh
{
if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &endpairtime);
+my_gettime(CLOCK_REALTIME, &endpairtime);
sdata->cuda_timings.pair_kernel +=
endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
@@ -986,7 +986,7 @@ void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
{
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &endpairtime);
+my_gettime(CLOCK_REALTIME, &endpairtime);
sdata->cuda_timings.pair_kernel +=
endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");

View File

@@ -263,12 +263,15 @@ struct V_FLOAT4 {
#endif
#ifdef NO_PREC_TIMING
-struct timespec_2 {
+struct my_times {
unsigned int tv_sec;
unsigned int tv_nsec;
};
-#define timespec timespec_2
-#define clock_gettime(a,b)
+#define my_gettime(a,b)
+#else
+#define my_times timespec
+#define my_gettime(a,b) clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/
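The hunk above is the core of the commit: instead of redefining the standard `timespec`/`clock_gettime` names, the header now provides `my_times`/`my_gettime`, which alias the POSIX facilities normally and collapse to a stub struct and a no-op when `NO_PREC_TIMING` is defined. A minimal standalone sketch of how the shim is used by the timing code in the other files follows; `time_region` and the commented kernel launch are illustrative, not part of the patch, and POSIX `clock_gettime` is assumed in the timed branch.

```cpp
#include <ctime>  // timespec, clock_gettime, CLOCK_REALTIME (POSIX)

//#define NO_PREC_TIMING  // define on platforms lacking clock_gettime

#ifdef NO_PREC_TIMING
struct my_times {
  unsigned int tv_sec;
  unsigned int tv_nsec;
};
#define my_gettime(a,b)            // timing calls compile away entirely
#else
#define my_times timespec
#define my_gettime(a,b) clock_gettime(a,b)
#endif

double time_region()
{
  my_times time1 = {}, time2 = {};  // zero-init: harmless if timing is off
  my_gettime(CLOCK_REALTIME, &time1);
  // ... launch a kernel and cudaThreadSynchronize() here ...
  my_gettime(CLOCK_REALTIME, &time2);
  // same accumulation expression as the hunks above; with NO_PREC_TIMING
  // both timestamps stay zero and the result is simply 0
  return time2.tv_sec - time1.tv_sec
         + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
```

Because `my_gettime(a,b)` expands to nothing under `NO_PREC_TIMING`, the `CLOCK_REALTIME` token is never evaluated there, so the code also builds on platforms that do not define it.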

View File

@@ -175,10 +175,10 @@ void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes
{
MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
cudaThreadSynchronize();
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
CudaWrapper_total_upload_time +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
@@ -193,10 +193,10 @@ void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbyt
{
MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
cudaThreadSynchronize();
-timespec time1, time2;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_times time1, time2;
+my_gettime(CLOCK_REALTIME, &time1);
CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
CudaWrapper_total_download_time +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}

View File

@@ -81,8 +81,8 @@ void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -90,7 +90,7 @@ void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* fa
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
@@ -145,8 +145,8 @@ void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int gro
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -154,7 +154,7 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
@@ -171,8 +171,8 @@ void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT fac
}
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -180,7 +180,7 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) /
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
@@ -198,8 +198,8 @@ void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) /
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
-timespec atime1, atime2;
-clock_gettime(CLOCK_REALTIME, &atime1);
+my_times atime1, atime2;
+my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
@@ -207,7 +207,7 @@ void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) /
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
-clock_gettime(CLOCK_REALTIME, &atime2);
+my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

View File

@@ -98,15 +98,15 @@ int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
-timespec starttime, endtime;
-clock_gettime(CLOCK_REALTIME, &starttime);
+my_times starttime, endtime;
+my_gettime(CLOCK_REALTIME, &starttime);
cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &endtime);
+my_gettime(CLOCK_REALTIME, &endtime);
sdata->cuda_timings.neigh_bin +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
@@ -228,8 +228,8 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
//shared_size=2056;
-timespec starttime, endtime;
-clock_gettime(CLOCK_REALTIME, &starttime);
+my_times starttime, endtime;
+my_gettime(CLOCK_REALTIME, &starttime);
//for(int i=0;i<100;i++)
{
if(sdata->overlap_comm)
@@ -250,7 +250,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
-clock_gettime(CLOCK_REALTIME, &endtime);
+my_gettime(CLOCK_REALTIME, &endtime);
sdata->cuda_timings.neigh_build +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
//dim3 threads,grid;
@@ -258,7 +258,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
if(buffer[0] >= 0 && true && sdata->atom.molecular) {
//printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
-clock_gettime(CLOCK_REALTIME, &starttime);
+my_gettime(CLOCK_REALTIME, &starttime);
int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
threads.x = layout.z;
threads.y = 1;
@@ -269,7 +269,7 @@ int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sn
FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
-clock_gettime(CLOCK_REALTIME, &endtime);
+my_gettime(CLOCK_REALTIME, &endtime);
sdata->cuda_timings.neigh_special +=
endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
}

View File

@@ -100,16 +100,16 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,
threads2.y = 1;
threads2.z = 1;
-timespec time1, time2;
+my_times time1, time2;
//pre-calculate all neighbor distances and zeta_ij
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test1 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
//actual force calculation
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
@@ -130,7 +130,7 @@ void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist,
(eflag_atom, vflag_atom);
}
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test2 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

View File

@@ -111,20 +111,20 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
threads2.y = 1;
threads2.z = 1;
-timespec time1, time2;
+my_times time1, time2;
//pre-calculate all neighbor distances and zeta_ij
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>
();
cudaThreadSynchronize();
Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>>
();
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test1 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
-clock_gettime(CLOCK_REALTIME, &time1);
+my_gettime(CLOCK_REALTIME, &time1);
//actual force calculation
unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
@@ -145,7 +145,7 @@ void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneigh
(eflag_atom, vflag_atom);
}
cudaThreadSynchronize();
-clock_gettime(CLOCK_REALTIME, &time2);
+my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.test2 +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
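Every hunk in this commit accumulates elapsed time with the same expression, `dt_sec + 1.0 * dt_nsec / 1000000000`. A self-contained sanity check follows (the timestamp values are invented for illustration); it shows why the expression stays correct when the nanosecond counter wraps past a second boundary, since a negative nanosecond difference offsets the whole-second difference:

```cpp
#include <cstdio>
#include <ctime>

int main()
{
  timespec time1 = {5, 900000000};  // hypothetical start: 5.9 s
  timespec time2 = {6, 100000000};  // hypothetical end:   6.1 s

  // Same form as the diff: 1 + (-0.8) = 0.2 s elapsed.
  double elapsed = time2.tv_sec - time1.tv_sec
                 + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  printf("%.3f s\n", elapsed);  // prints 0.200
  return 0;
}
```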