git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9044 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-11-09 18:53:01 +00:00
parent ac51e677ab
commit d8efe2a7d3
4 changed files with 220 additions and 491 deletions

View File

@ -24,6 +24,7 @@
#include "pppm_gpu.h"
#include "atom.h"
#include "comm.h"
#include "commgrid.h"
#include "neighbor.h"
#include "force.h"
#include "pair.h"
@ -48,6 +49,9 @@ using namespace MathConst;
#define LARGE 10000.0
#define EPS_HOC 1.0e-7
enum{REVERSE_RHO};
enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
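// comm flag values: REVERSE_RHO sums ghost-cell charge density into owned
// cells; the FORWARD_* flags push E-field / per-atom data out to ghost
// cells (see pack_forward() below for which brick each flag selects)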
#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF 1.0f
@ -56,7 +60,7 @@ using namespace MathConst;
#define ONEF 1.0
#endif
// External functions from cuda library for atom decomposition
// external functions from cuda library for atom decomposition
#ifdef FFT_SINGLE
#define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
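// the ## token pasting selects the precision-specific entry points of the
// gpu library, e.g. PPPM_GPU_API(interp) expands to pppm_gpu_interp_f here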
@ -111,10 +115,35 @@ PPPMGPU::~PPPMGPU()
void PPPMGPU::init()
{
// PPPM init manages all arrays except density_brick_gpu and vd_brick
// thru its deallocate(), allocate()
// NOTE: could free density_brick and vdxyz_brick after PPPM allocates them,
// before allocating density_brick_gpu and vd_brick down below, if they
// are not needed; if doing this, make sure to set them to NULL
// NOTE: delete/realloc of cg necessary b/c packing 4 values per grid pt,
// not 3 as PPPM does - there is probably a better way to account for this
// in PPPM::init()
destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
density_brick_gpu = vd_brick = NULL;
PPPM::init();
if (differentiation_flag == 0) {
delete cg;
int (*procneigh)[2] = comm->procneigh;
cg = new CommGrid(lmp,world,4,1,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
procneigh[0][0],procneigh[0][1],procneigh[1][0],
procneigh[1][1],procneigh[2][0],procneigh[2][1]);
}
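// context for the 4-value repack above: vd_brick (allocated in init()
// below with vec_length = 4) interleaves the field components per grid
// point, roughly:
//   int ixx = 4*ix;                           // 4 slots per grid point
//   FFT_SCALAR ex = vd_brick[iz][iy][ixx];    // x field component
//   FFT_SCALAR ey = vd_brick[iz][iy][ixx+1];  // y field component
//   FFT_SCALAR ez = vd_brick[iz][iy][ixx+2];  // z field component
// the 4th slot is not read by the pack/unpack routines below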
// unsupported option
if (differentiation_flag == 1)
error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu.");
error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu");
if (strcmp(update->integrate_style,"verlet/split") == 0) {
kspace_split=true;
@ -142,6 +171,8 @@ void PPPMGPU::init()
GPU_EXTRA::check_flag(success,error,world);
// allocate density_brick_gpu and vd_brick
density_brick_gpu =
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
@ -149,7 +180,7 @@ void PPPMGPU::init()
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vd_brick",data,4);
poisson_time=0;
poisson_time = 0.0;
}
/* ----------------------------------------------------------------------
@ -164,10 +195,8 @@ void PPPMGPU::compute(int eflag, int vflag)
if (atom->nlocal > old_nlocal) {
nago=0;
old_nlocal = atom->nlocal;
} else
nago=1;
} else
nago=neighbor->ago;
} else nago = 1;
} else nago = neighbor->ago;
// set energy/virial flags
// invoke allocate_peratom() if needed for first time
@ -211,12 +240,13 @@ void PPPMGPU::compute(int eflag, int vflag)
domain->x2lamda(atom->nlocal);
}
double t3=MPI_Wtime();
double t3 = MPI_Wtime();
// all procs communicate density values from their ghost cells
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
cg->reverse_comm(this,REVERSE_RHO);
brick2fft();
// compute potential gradient on my FFT grid and
@ -225,23 +255,32 @@ void PPPMGPU::compute(int eflag, int vflag)
poisson();
// all procs communicate E-field values to fill ghost cells
// surrounding their 3d bricks
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
fillbrick();
if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
else cg->forward_comm(this,FORWARD_IK);
poisson_time+=MPI_Wtime()-t3;
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
else if (differentiation_flag == 0)
cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
}
poisson_time += MPI_Wtime()-t3;
// calculate the force on my particles
FFT_SCALAR qscale = force->qqrd2e * scale;
PPPM_GPU_API(interp)(qscale);
// Compute per-atom energy/virial on host if requested
// per-atom energy/virial
// energy includes self-energy correction
if (evflag_atom) {
fillbrick_peratom();
fieldforce_peratom();
double *q = atom->q;
int nlocal = atom->nlocal;
@ -293,246 +332,13 @@ void PPPMGPU::compute(int eflag, int vflag)
}
/* ----------------------------------------------------------------------
allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMGPU::allocate()
{
memory->create(density_fft,nfft_both,"pppm:density_fft");
memory->create(greensfn,nfft_both,"pppm:greensfn");
memory->create(work1,2*nfft_both,"pppm:work1");
memory->create(work2,2*nfft_both,"pppm:work2");
memory->create(vg,nfft_both,6,"pppm:vg");
memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
memory->create(buf1,nbuf,"pppm:buf1");
memory->create(buf2,nbuf,"pppm:buf2");
// summation coeffs
memory->create(gf_b,order,"pppm:gf_b");
memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d");
memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
"pppm:drho_coeff");
// create 2 FFTs and a Remap
// 1st FFT keeps data in FFT decomposition
// 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition
int tmp;
fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp);
fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp);
remap = new Remap(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION);
}
/* ----------------------------------------------------------------------
deallocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMGPU::deallocate()
{
destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
memory->destroy(density_fft);
memory->destroy(greensfn);
memory->destroy(work1);
memory->destroy(work2);
memory->destroy(vg);
memory->destroy1d_offset(fkx,nxlo_fft);
memory->destroy1d_offset(fky,nylo_fft);
memory->destroy1d_offset(fkz,nzlo_fft);
memory->destroy(buf1);
memory->destroy(buf2);
memory->destroy(gf_b);
memory->destroy2d_offset(rho1d,-order/2);
memory->destroy2d_offset(drho1d,-order/2);
memory->destroy2d_offset(rho_coeff,(1-order)/2);
memory->destroy2d_offset(drho_coeff,(1-order)/2);
delete fft1;
delete fft2;
delete remap;
}
/* ----------------------------------------------------------------------
ghost-swap to accumulate full density in brick decomposition
remap density from 3d brick decomposition to FFT decomposition
------------------------------------------------------------------------- */
void PPPMGPU::brick2fft()
{
int i,n,ix,iy,iz;
MPI_Request request;
MPI_Status status;
int n,ix,iy,iz;
// pack my ghosts for +x processor
// pass data to self or +x processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[0][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -x processor
// pass data to self or -x processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxlo_out; ix < nxlo_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[0][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for +y processor
// pass data to self or +y processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[1][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -y processor
// pass data to self or -y processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy < nylo_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[1][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for +z processor
// pass data to self or +z processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[2][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// pack my ghosts for -z processor
// pass data to self or -z processor
// unpack and sum recv data into my real cells
n = 0;
for (iz = nzlo_out; iz < nzlo_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
buf1[n++] = density_brick_gpu[iz][iy][ix];
if (comm->procneigh[2][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++)
density_brick_gpu[iz][iy][ix] += buf2[n++];
// remap from 3d brick decomposition to FFT decomposition
// copy grabs inner portion of density from 3d brick
// remap could be done as pre-stage of FFT,
// but this works optimally on only double values, not complex values
@ -546,216 +352,6 @@ void PPPMGPU::brick2fft()
remap->perform(density_fft,density_fft,work1);
}
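The hunk boundary above elides the copy that grabs the inner portion; a sketch of its shape, mirroring the base-class PPPM::brick2fft() with density_brick_gpu substituted:

n = 0;
for (iz = nzlo_in; iz <= nzhi_in; iz++)
  for (iy = nylo_in; iy <= nyhi_in; iy++)
    for (ix = nxlo_in; ix <= nxhi_in; ix++)
      density_fft[n++] = density_brick_gpu[iz][iy][ix];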
/* ----------------------------------------------------------------------
Same as base class - needed to call GPU version of fillbrick_ik()
------------------------------------------------------------------------- */
void PPPMGPU::fillbrick()
{
if (differentiation_flag == 1) fillbrick_ad();
else fillbrick_ik();
}
/* ----------------------------------------------------------------------
ghost-swap to fill ghost cells of my brick with field values
------------------------------------------------------------------------- */
void PPPMGPU::fillbrick_ik()
{
int i,n,ix,iy,iz;
MPI_Request request;
MPI_Status status;
// pack my real cells for +z processor
// pass data to self or +z processor
// unpack and sum recv data into my ghost cells
n = 0;
int x_lo = nxlo_in * 4;
int x_hi = nxhi_in * 4 + 1;
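// x indices above are pre-scaled by 4 since vd_brick stores 4 values per
// grid point; the +1 makes the strided "ix < x_hi" loops below include
// the last owned grid point (ix == nxhi_in*4)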
for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[2][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz < nzlo_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -z processor
// pass data to self or -z processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[2][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for +y processor
// pass data to self or +y processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[1][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy < nylo_in; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -y processor
// pass data to self or -y processor
// unpack and sum recv data into my ghost cells
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[1][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for +x processor
// pass data to self or +x processor
// unpack and sum recv data into my ghost cells
n = 0;
x_lo = (nxhi_in-nxhi_ghost+1) * 4;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[0][1] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
MPI_Wait(&request,&status);
}
n = 0;
x_lo = nxlo_out * 4;
x_hi = nxlo_in * 4;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
// pack my real cells for -x processor
// pass data to self or -x processor
// unpack and sum recv data into my ghost cells
n = 0;
x_lo = x_hi;
x_hi = (nxlo_in+nxlo_ghost) * 4;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
buf1[n++] = vd_brick[iz][iy][ix];
buf1[n++] = vd_brick[iz][iy][ix+1];
buf1[n++] = vd_brick[iz][iy][ix+2];
}
if (comm->procneigh[0][0] == me)
for (i = 0; i < n; i++) buf2[i] = buf1[i];
else {
MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
MPI_Wait(&request,&status);
}
n = 0;
x_lo = (nxhi_in + 1) * 4;
x_hi = nxhi_out * 4 + 1;
for (iz = nzlo_out; iz <= nzhi_out; iz++)
for (iy = nylo_out; iy <= nyhi_out; iy++)
for (ix = x_lo; ix < x_hi; ix+=4) {
vd_brick[iz][iy][ix] = buf2[n++];
vd_brick[iz][iy][ix+1] = buf2[n++];
vd_brick[iz][iy][ix+2] = buf2[n++];
}
}
/* ----------------------------------------------------------------------
Same code as base class - necessary to call GPU version of poisson_ik
------------------------------------------------------------------------- */
@ -892,12 +488,154 @@ void PPPMGPU::poisson_ik()
}
/* ----------------------------------------------------------------------
Create array using offsets from pinned memory allocation
pack own values to buf to send to another proc
------------------------------------------------------------------------- */
void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
int n = 0;
if (flag == FORWARD_IK) {
int offset;
FFT_SCALAR *src = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
for (int i = 0; i < nlist; i++) {
offset = 4*list[i];
buf[n++] = src[offset++];
buf[n++] = src[offset++];
buf[n++] = src[offset];
}
} else if (flag == FORWARD_AD) {
FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
} else if (flag == FORWARD_IK_PERATOM) {
FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++) {
if (eflag_atom) buf[n++] = esrc[list[i]];
if (vflag_atom) {
buf[n++] = v0src[list[i]];
buf[n++] = v1src[list[i]];
buf[n++] = v2src[list[i]];
buf[n++] = v3src[list[i]];
buf[n++] = v4src[list[i]];
buf[n++] = v5src[list[i]];
}
}
} else if (flag == FORWARD_AD_PERATOM) {
FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++) {
buf[n++] = v0src[list[i]];
buf[n++] = v1src[list[i]];
buf[n++] = v2src[list[i]];
buf[n++] = v3src[list[i]];
buf[n++] = v4src[list[i]];
buf[n++] = v5src[list[i]];
}
}
}
/* ----------------------------------------------------------------------
unpack another proc's own values from buf and set own ghost values
------------------------------------------------------------------------- */
void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
int n = 0;
if (flag == FORWARD_IK) {
int offset;
FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
for (int i = 0; i < nlist; i++) {
offset = 4*list[i];
dest[offset++] = buf[n++];
dest[offset++] = buf[n++];
dest[offset] = buf[n++];
}
} else if (flag == FORWARD_AD) {
FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
dest[list[i]] = buf[n++];
} else if (flag == FORWARD_IK_PERATOM) {
FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++) {
if (eflag_atom) esrc[list[i]] = buf[n++];
if (vflag_atom) {
v0src[list[i]] = buf[n++];
v1src[list[i]] = buf[n++];
v2src[list[i]] = buf[n++];
v3src[list[i]] = buf[n++];
v4src[list[i]] = buf[n++];
v5src[list[i]] = buf[n++];
}
}
} else if (flag == FORWARD_AD_PERATOM) {
FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++) {
v0src[list[i]] = buf[n++];
v1src[list[i]] = buf[n++];
v2src[list[i]] = buf[n++];
v3src[list[i]] = buf[n++];
v4src[list[i]] = buf[n++];
v5src[list[i]] = buf[n++];
}
}
}
/* ----------------------------------------------------------------------
pack ghost values into buf to send to another proc
------------------------------------------------------------------------- */
void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
if (flag == REVERSE_RHO) {
FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
buf[i] = src[list[i]];
}
}
/* ----------------------------------------------------------------------
unpack another proc's ghost values from buf and add to own values
------------------------------------------------------------------------- */
void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
{
if (flag == REVERSE_RHO) {
FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
for (int i = 0; i < nlist; i++)
dest[list[i]] += buf[i];
}
}
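For orientation, a hedged sketch of how a CommGrid swap is assumed to drive these four callbacks (buffer and list names here are hypothetical):

// one reverse swap, as triggered by cg->reverse_comm(this,REVERSE_RHO):
//   pack_reverse(REVERSE_RHO,sendbuf,nsend,packlist);     // my ghosts -> sendbuf
//   exchange sendbuf/recvbuf with the neighbor proc via MPI
//   unpack_reverse(REVERSE_RHO,recvbuf,nrecv,unpacklist); // owned cells += recvbuf
// forward swaps do the same with pack_forward()/unpack_forward(),
// moving owned values out to the neighbor's ghost cells instead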
/* ----------------------------------------------------------------------
create array using offsets from pinned memory allocation
------------------------------------------------------------------------- */
FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
int n3lo, int n3hi, const char *name,
FFT_SCALAR *data, int vec_length)
int n3lo, int n3hi, const char *name,
FFT_SCALAR *data, int vec_length)
{
int i,j;
int n1 = n1hi - n1lo + 1;
@ -942,19 +680,11 @@ void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,
double PPPMGPU::memory_usage()
{
double bytes = nmax*3 * sizeof(double);
int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
(nzhi_out-nzlo_out+1);
bytes += 4 * nbrick * sizeof(FFT_SCALAR);
bytes += 6 * nfft_both * sizeof(double);
bytes += nfft_both * sizeof(double);
bytes += nfft_both*5 * sizeof(FFT_SCALAR);
bytes += 2 * nbuf * sizeof(double);
double bytes = PPPM::memory_usage();
if (peratom_allocate_flag) {
bytes += 7 * nbrick * sizeof(FFT_SCALAR);
bytes += 2 * nbuf_peratom * sizeof(FFT_SCALAR);
}
// NOTE: add tallying here for density_brick_gpu and vd_brick
// could subtract out density_brick and vdxyz_brick if they are freed above;
// if the net effect is zero, do nothing
return bytes + PPPM_GPU_API(bytes)();
}
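A hedged sketch of the tally the NOTE asks for, reusing the nbrick expression from the lines removed above (the possible density_brick/vdxyz_brick subtraction is left out):

int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
  (nzhi_out-nzlo_out+1);
bytes += nbrick * sizeof(FFT_SCALAR);       // density_brick_gpu
bytes += 4 * nbrick * sizeof(FFT_SCALAR);   // vd_brick, 4 values per grid pt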

View File

@ -35,20 +35,19 @@ class PPPMGPU : public PPPM {
double memory_usage();
protected:
FFT_SCALAR ***density_brick_gpu, ***vd_brick;
bool kspace_split, im_real_space;
int old_nlocal;
double poisson_time;
void allocate();
void deallocate();
void brick2fft();
void fillbrick();
void fillbrick_ik();
void poisson();
void poisson_ik();
int old_nlocal;
double poisson_time;
void pack_forward(int, FFT_SCALAR *, int, int *);
void unpack_forward(int, FFT_SCALAR *, int, int *);
void pack_reverse(int, FFT_SCALAR *, int, int *);
void unpack_reverse(int, FFT_SCALAR *, int, int *);
FFT_SCALAR ***create_3d_offset(int, int, int, int, int, int, const char *,
FFT_SCALAR *, int);

View File

@ -150,10 +150,10 @@ class PPPM : public KSpace {
// grid communication
void pack_forward(int, FFT_SCALAR *, int, int *);
void unpack_forward(int, FFT_SCALAR *, int, int *);
void pack_reverse(int, FFT_SCALAR *, int, int *);
void unpack_reverse(int, FFT_SCALAR *, int, int *);
virtual void pack_forward(int, FFT_SCALAR *, int, int *);
virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
virtual void pack_reverse(int, FFT_SCALAR *, int, int *);
virtual void unpack_reverse(int, FFT_SCALAR *, int, int *);
// group-group interactions
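Why these declarations become virtual: CommGrid invokes the pack/unpack hooks through a base-class pointer, so without virtual dispatch the PPPMGPU overrides in the first file would never run. A minimal standalone illustration (not LAMMPS code; the names are made up):

#include <cstdio>

struct Base {
  virtual void pack_forward() { std::puts("PPPM: pack 3 values per grid pt"); }
  virtual ~Base() {}
};

struct Derived : Base {
  void pack_forward() { std::puts("PPPMGPU: pack from interleaved vd_brick"); }
};

int main() {
  Base *k = new Derived();   // CommGrid holds a base-class pointer like this
  k->pack_forward();         // reaches Derived only because the base decl is virtual
  delete k;
  return 0;
}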

View File

@ -921,7 +921,7 @@ void ReadData::mass()
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}
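The same one-character fix (buffer -> buf) repeats in every coeff-reading hunk below. The idiom pads a final line that lacks a trailing newline so the next statement can always strip exactly one character. A self-contained sketch of the pattern under assumed names (data.txt and the MAXLINE sizing are hypothetical):

#include <cstdio>
#include <cstring>

#define MAXLINE 128

int main() {
  char buf[MAXLINE+2];                 // +2 leaves room for a padded "\n\0"
  FILE *fp = fopen("data.txt","r");    // hypothetical input file
  if (fp == NULL) return 1;
  int m = 0;
  if (fgets(&buf[m],MAXLINE,fp) != NULL) {
    m += (int) strlen(&buf[m]);
    // pad a missing trailing newline so buf[m-1] is always the terminator
    if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
    buf[m-1] = '\0';                   // then overwrite it with a string terminator
    printf("line: '%s'\n",buf);
  }
  fclose(fp);
  return 0;
}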
@ -951,7 +951,7 @@ void ReadData::paircoeffs()
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}
@ -983,7 +983,7 @@ void ReadData::bondcoeffs()
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}
@ -1015,7 +1015,7 @@ void ReadData::anglecoeffs(int which)
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}
@ -1049,7 +1049,7 @@ void ReadData::dihedralcoeffs(int which)
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}
@ -1086,7 +1086,7 @@ void ReadData::impropercoeffs(int which)
eof = fgets(&buf[m],MAXLINE,fp);
if (eof == NULL) error->one(FLERR,"Unexpected end of data file");
m += strlen(&buf[m]);
if (buffer[m-1] != '\n') strcpy(&buffer[m++],"\n");
if (buf[m-1] != '\n') strcpy(&buf[m++],"\n");
buf[m-1] = '\0';
}
}