From d8efe2a7d32f2ebd4d7e1377eadc90033a1be376 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 9 Nov 2012 18:53:01 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9044 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/pppm_gpu.cpp | 678 +++++++++++++------------------------------ src/GPU/pppm_gpu.h | 13 +- src/KSPACE/pppm.h | 8 +- src/read_data.cpp | 12 +- 4 files changed, 220 insertions(+), 491 deletions(-) diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 4724d516cd..a8138067de 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -24,6 +24,7 @@ #include "pppm_gpu.h" #include "atom.h" #include "comm.h" +#include "commgrid.h" #include "neighbor.h" #include "force.h" #include "pair.h" @@ -48,6 +49,9 @@ using namespace MathConst; #define LARGE 10000.0 #define EPS_HOC 1.0e-7 +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + #ifdef FFT_SINGLE #define ZEROF 0.0f #define ONEF 1.0f @@ -56,7 +60,7 @@ using namespace MathConst; #define ONEF 1.0 #endif -// External functions from cuda library for atom decomposition +// external functions from cuda library for atom decomposition #ifdef FFT_SINGLE #define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f @@ -111,10 +115,35 @@ PPPMGPU::~PPPMGPU() void PPPMGPU::init() { + // PPPM init manages all arrays except density_brick_gpu and vd_brick + // thru its deallocate(), allocate() + // NOTE: could free density_brick and vdxyz_brick after PPPM allocates them, + // before allocating db_gpu and vd_brick down below, if don't need, + // if do this, make sure to set them to NULL + // NOTE: delete/realloc of cg necessary b/c packing 4 values per grid pt, + // not 3 as PPPM does - probably a better way to account for this + // in PPPM::init() + + destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); + destroy_3d_offset(vd_brick,nzlo_out,nylo_out); + density_brick_gpu = vd_brick = NULL; + PPPM::init(); + if (differentiation_flag == 0) { + delete cg; + int (*procneigh)[2] = comm->procneigh; + cg = new CommGrid(lmp,world,4,1, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); + } + + // unsupported option + if (differentiation_flag == 1) - error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu."); + error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu"); if (strcmp(update->integrate_style,"verlet/split") == 0) { kspace_split=true; @@ -142,6 +171,8 @@ void PPPMGPU::init() GPU_EXTRA::check_flag(success,error,world); + // allocate density_brick_gpu and vd_brick + density_brick_gpu = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1); @@ -149,7 +180,7 @@ void PPPMGPU::init() create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:vd_brick",data,4); - poisson_time=0; + poisson_time = 0.0; } /* ---------------------------------------------------------------------- @@ -164,10 +195,8 @@ void PPPMGPU::compute(int eflag, int vflag) if (atom->nlocal > old_nlocal) { nago=0; old_nlocal = atom->nlocal; - } else - nago=1; - } else - nago=neighbor->ago; + } else nago = 1; + } else nago = neighbor->ago; // set energy/virial flags // invoke allocate_peratom() if needed for first time @@ -211,12 +240,13 @@ void PPPMGPU::compute(int eflag, int vflag) domain->x2lamda(atom->nlocal); } - double t3=MPI_Wtime(); + double t3 = 
MPI_Wtime(); // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition + cg->reverse_comm(this,REVERSE_RHO); brick2fft(); // compute potential gradient on my FFT grid and @@ -225,23 +255,32 @@ void PPPMGPU::compute(int eflag, int vflag) poisson(); - // all procs communicate E-field values to fill ghost cells - // surrounding their 3d bricks + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks - fillbrick(); + if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); + else cg->forward_comm(this,FORWARD_IK); - poisson_time+=MPI_Wtime()-t3; + // extra per-atom energy/virial communication + + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + else if (differentiation_flag == 0) + cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + } + + poisson_time += MPI_Wtime()-t3; // calculate the force on my particles FFT_SCALAR qscale = force->qqrd2e * scale; PPPM_GPU_API(interp)(qscale); - // Compute per-atom energy/virial on host if requested + // per-atom energy/virial + // energy includes self-energy correction if (evflag_atom) { - fillbrick_peratom(); - fieldforce_peratom(); double *q = atom->q; int nlocal = atom->nlocal; @@ -293,246 +332,13 @@ void PPPMGPU::compute(int eflag, int vflag) } /* ---------------------------------------------------------------------- - allocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMGPU::allocate() -{ - memory->create(density_fft,nfft_both,"pppm:density_fft"); - memory->create(greensfn,nfft_both,"pppm:greensfn"); - memory->create(work1,2*nfft_both,"pppm:work1"); - memory->create(work2,2*nfft_both,"pppm:work2"); - memory->create(vg,nfft_both,6,"pppm:vg"); - - memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); - memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); - memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); - - memory->create(buf1,nbuf,"pppm:buf1"); - memory->create(buf2,nbuf,"pppm:buf2"); - - // summation coeffs - - memory->create(gf_b,order,"pppm:gf_b"); - memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); - memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d"); - memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); - memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, - "pppm:drho_coeff"); - - // create 2 FFTs and a Remap - // 1st FFT keeps data in FFT decompostion - // 2nd FFT returns data in 3d brick decomposition - // remap takes data from 3d brick to FFT decomposition - - int tmp; - - fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp); - - fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp); - - remap = new Remap(lmp,world, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION); -} - -/* ---------------------------------------------------------------------- - deallocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMGPU::deallocate() -{ - 
destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); - destroy_3d_offset(vd_brick,nzlo_out,nylo_out); - - memory->destroy(density_fft); - memory->destroy(greensfn); - memory->destroy(work1); - memory->destroy(work2); - memory->destroy(vg); - - memory->destroy1d_offset(fkx,nxlo_fft); - memory->destroy1d_offset(fky,nylo_fft); - memory->destroy1d_offset(fkz,nzlo_fft); - - memory->destroy(buf1); - memory->destroy(buf2); - - memory->destroy(gf_b); - memory->destroy2d_offset(rho1d,-order/2); - memory->destroy2d_offset(drho1d,-order/2); - memory->destroy2d_offset(rho_coeff,(1-order)/2); - memory->destroy2d_offset(drho_coeff,(1-order)/2); - - delete fft1; - delete fft2; - delete remap; -} - - -/* ---------------------------------------------------------------------- - ghost-swap to accumulate full density in brick decomposition remap density from 3d brick decomposition to FFT decomposition ------------------------------------------------------------------------- */ void PPPMGPU::brick2fft() { - int i,n,ix,iy,iz; - MPI_Request request; - MPI_Status status; + int n,ix,iy,iz; - // pack my ghosts for +x processor - // pass data to self or +x processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxhi_in+1; ix <= nxhi_out; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[0][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -x processor - // pass data to self or -x processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxlo_out; ix < nxlo_in; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[0][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for +y processor - // pass data to self or +y processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in+1; iy <= nyhi_out; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[1][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -y processor - // pass data to self or -y processor - // unpack and sum recv data into my real cells - - n 
= 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy < nylo_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[1][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for +z processor - // pass data to self or +z processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzhi_in+1; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[2][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -z processor - // pass data to self or -z processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz < nzlo_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick_gpu[iz][iy][ix]; - - if (comm->procneigh[2][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick_gpu[iz][iy][ix] += buf2[n++]; - - // remap from 3d brick decomposition to FFT decomposition // copy grabs inner portion of density from 3d brick // remap could be done as pre-stage of FFT, // but this works optimally on only double values, not complex values @@ -546,216 +352,6 @@ void PPPMGPU::brick2fft() remap->perform(density_fft,density_fft,work1); } -/* ---------------------------------------------------------------------- - Same as base class - needed to call GPU version of fillbrick_. 
-------------------------------------------------------------------------- */ - -void PPPMGPU::fillbrick() -{ - if (differentiation_flag == 1) fillbrick_ad(); - else fillbrick_ik(); -} - -/* ---------------------------------------------------------------------- - ghost-swap to fill ghost cells of my brick with field values -------------------------------------------------------------------------- */ - -void PPPMGPU::fillbrick_ik() -{ - int i,n,ix,iy,iz; - MPI_Request request; - MPI_Status status; - - // pack my real cells for +z processor - // pass data to self or +z processor - // unpack and sum recv data into my ghost cells - - n = 0; - int x_lo = nxlo_in * 4; - int x_hi = nxhi_in * 4 + 1; - for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[2][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz < nzlo_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -z processor - // pass data to self or -z processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[2][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzhi_in+1; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for +y processor - // pass data to self or +y processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[1][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy < nylo_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -y processor - // pass data to self or -y processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) - for (ix = x_lo; ix 
< x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[1][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in+1; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for +x processor - // pass data to self or +x processor - // unpack and sum recv data into my ghost cells - - n = 0; - x_lo = (nxhi_in-nxhi_ghost+1) * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[0][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - x_lo = nxlo_out * 4; - x_hi = nxlo_in * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -x processor - // pass data to self or -x processor - // unpack and sum recv data into my ghost cells - - n = 0; - x_lo = x_hi; - x_hi = (nxlo_in+nxlo_ghost) * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[0][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request); - MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - x_lo = (nxhi_in + 1) * 4; - x_hi = nxhi_out * 4 + 1; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } -} - /* ---------------------------------------------------------------------- Same code as base class - necessary to call GPU version of poisson_ik ------------------------------------------------------------------------- */ @@ -892,12 +488,154 @@ void PPPMGPU::poisson_ik() } /* ---------------------------------------------------------------------- - Create array using offsets from pinned memory allocation + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +{ + int n = 0; + + if (flag == FORWARD_IK) { + int offset; + FFT_SCALAR *src = &vd_brick[nzlo_out][nylo_out][4*nxlo_out]; + for (int i = 0; i < nlist; i++) { + offset = 4*list[i]; + buf[n++] = src[offset++]; + buf[n++] = src[offset++]; + buf[n++] = src[offset]; + } + } else if 
(flag == FORWARD_AD) {
+    FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) buf[n++] = esrc[list[i]];
+      if (vflag_atom) {
+        buf[n++] = v0src[list[i]];
+        buf[n++] = v1src[list[i]];
+        buf[n++] = v2src[list[i]];
+        buf[n++] = v3src[list[i]];
+        buf[n++] = v4src[list[i]];
+        buf[n++] = v5src[list[i]];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = v0src[list[i]];
+      buf[n++] = v1src[list[i]];
+      buf[n++] = v2src[list[i]];
+      buf[n++] = v3src[list[i]];
+      buf[n++] = v4src[list[i]];
+      buf[n++] = v5src[list[i]];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_IK) {
+    int offset;
+    FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      offset = 4*list[i];
+      dest[offset++] = buf[n++];
+      dest[offset++] = buf[n++];
+      dest[offset] = buf[n++];
+    }
+  } else if (flag == FORWARD_AD) {
+    FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] = buf[n++];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) esrc[list[i]] = buf[n++];
+      if (vflag_atom) {
+        v0src[list[i]] = buf[n++];
+        v1src[list[i]] = buf[n++];
+        v2src[list[i]] = buf[n++];
+        v3src[list[i]] = buf[n++];
+        v4src[list[i]] = buf[n++];
+        v5src[list[i]] = buf[n++];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      v0src[list[i]] = buf[n++];
+      v1src[list[i]] = buf[n++];
+      v2src[list[i]] = buf[n++];
+      v3src[list[i]] = buf[n++];
+      v4src[list[i]] = buf[n++];
+      v5src[list[i]] = buf[n++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] += buf[i];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create array using offsets from pinned memory allocation
 ------------------------------------------------------------------------- */
 
 FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
-                                     int n3lo, int n3hi, const char *name,
-                                     FFT_SCALAR *data, int vec_length)
+                                        int n3lo, int n3hi, const char *name,
+                                        FFT_SCALAR *data, int vec_length)
 {
   int i,j;
   int n1 = n1hi - n1lo + 1;
@@ -942,19 +680,11 @@ void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,
 
 double PPPMGPU::memory_usage()
 {
-  double bytes = nmax*3 * sizeof(double);
-  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
-    (nzhi_out-nzlo_out+1);
-  bytes += 4 * nbrick * sizeof(FFT_SCALAR);
-  bytes += 6 * nfft_both * sizeof(double);
-  bytes += nfft_both * sizeof(double);
-  bytes += nfft_both*5 * sizeof(FFT_SCALAR);
-  bytes += 2 * nbuf * sizeof(double);
+  double bytes = PPPM::memory_usage();
 
-  if (peratom_allocate_flag) {
-    bytes += 7 * nbrick * sizeof(FFT_SCALAR);
-    bytes += 2 * nbuf_peratom * sizeof(FFT_SCALAR);
-  }
+  // NOTE: add tallying here for density_brick_gpu and vd_brick
+  //       could subtract out density_brick and vdxyz_brick if freed them above
+  //       if the net effect is zero, do nothing
 
   return bytes + PPPM_GPU_API(bytes)();
 }
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 4c9e8441a9..98d62ffa24 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -35,20 +35,19 @@ class PPPMGPU : public PPPM {
   double memory_usage();
 
 protected:
-  FFT_SCALAR ***density_brick_gpu, ***vd_brick;
   bool kspace_split, im_real_space;
+  int old_nlocal;
+  double poisson_time;
 
-  void allocate();
-  void deallocate();
   void brick2fft();
-  void fillbrick();
-  void fillbrick_ik();
   void poisson();
   void poisson_ik();
 
-  int old_nlocal;
-  double poisson_time;
+  void pack_forward(int, FFT_SCALAR *, int, int *);
+  void unpack_forward(int, FFT_SCALAR *, int, int *);
+  void pack_reverse(int, FFT_SCALAR *, int, int *);
+  void unpack_reverse(int, FFT_SCALAR *, int, int *);
 
   FFT_SCALAR ***create_3d_offset(int, int, int, int, int, int, const char *,
                                  FFT_SCALAR *, int);
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index 2f5dc2241d..a7d0dfb2cd 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -150,10 +150,10 @@ class PPPM : public KSpace {
 
   // grid communication
 
-  void pack_forward(int, FFT_SCALAR *, int, int *);
-  void unpack_forward(int, FFT_SCALAR *, int, int *);
-  void 
pack_reverse(int, FFT_SCALAR *, int, int *); - void unpack_reverse(int, FFT_SCALAR *, int, int *); + virtual void pack_forward(int, FFT_SCALAR *, int, int *); + virtual void unpack_forward(int, FFT_SCALAR *, int, int *); + virtual void pack_reverse(int, FFT_SCALAR *, int, int *); + virtual void unpack_reverse(int, FFT_SCALAR *, int, int *); // group-group interactions diff --git a/src/read_data.cpp b/src/read_data.cpp index 658ac11caa..d14d5084ad 100644 --- a/src/read_data.cpp +++ b/src/read_data.cpp @@ -921,7 +921,7 @@ void ReadData::mass() eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } } @@ -951,7 +951,7 @@ void ReadData::paircoeffs() eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } } @@ -983,7 +983,7 @@ void ReadData::bondcoeffs() eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } } @@ -1015,7 +1015,7 @@ void ReadData::anglecoeffs(int which) eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } } @@ -1049,7 +1049,7 @@ void ReadData::dihedralcoeffs(int which) eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } } @@ -1086,7 +1086,7 @@ void ReadData::impropercoeffs(int which) eof = fgets(&buf[m],MAXLINE,fp); if (eof == NULL) error->one(FLERR,"Unexpected end of data file"); m += strlen(&buf[m]); - if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n"); + if (buf[m-1] != '\n') strcpy(&buf[m++],"\n"); buf[m-1] = '\0'; } }
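
The stride-4 indexing in pack_forward()/unpack_forward() above is the one non-obvious part of the new grid communication: vd_brick interleaves four FFT_SCALAR values per grid point, which is why init() rebuilds cg as a CommGrid with 4 values per point instead of the 3 that base PPPM uses, yet only the first three slots of each point are ever packed or unpacked. The following standalone sketch is not part of the patch; the brick size, list contents, and main() harness are made up purely to illustrate the offset = 4*list[i] pattern on a toy interleaved array.

// Toy illustration of the stride-4 pack/unpack indexing used by
// PPPMGPU::pack_forward()/unpack_forward() for FORWARD_IK above.
// Not from the patch: array sizes and names here are hypothetical.

#include <cstdio>

typedef double FFT_SCALAR;   // the real code uses FFT_SCALAR (float or double)

int main()
{
  const int npoints = 5;                 // toy grid with 5 points
  FFT_SCALAR brick[4*npoints];           // interleaved: 4 slots per grid point
  for (int i = 0; i < 4*npoints; i++) brick[i] = i;

  const int nlist = 2;
  int list[nlist] = {1, 3};              // grid points to communicate
  FFT_SCALAR buf[3*nlist];               // only 3 of the 4 slots are sent

  // pack: same pattern as pack_forward(FORWARD_IK,...)
  int n = 0;
  for (int i = 0; i < nlist; i++) {
    int offset = 4*list[i];              // stride 4 into the interleaved brick
    buf[n++] = brick[offset++];          // 1st slot
    buf[n++] = brick[offset++];          // 2nd slot
    buf[n++] = brick[offset];            // 3rd slot (4th slot never sent)
  }

  // unpack: same pattern as unpack_forward(FORWARD_IK,...)
  FFT_SCALAR ghost[4*npoints] = {0};
  n = 0;
  for (int i = 0; i < nlist; i++) {
    int offset = 4*list[i];
    ghost[offset++] = buf[n++];
    ghost[offset++] = buf[n++];
    ghost[offset]   = buf[n++];
  }

  printf("point 3: %g %g %g\n", ghost[12], ghost[13], ghost[14]);
  return 0;
}

The REVERSE_RHO path in pack_reverse()/unpack_reverse() follows the same list-driven addressing with stride 1, except that unpack accumulates into density_brick_gpu with += instead of overwriting, since ghost contributions from neighboring procs must be summed.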