diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 084c524f0b..50ed97fa93 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -16,6 +16,7 @@ Axel Kohlmeyer (Temple U) added support for FFTW3, KISSFFT, Dfti/MKL, and ACML. Phil Blood (PSC) added single precision FFT. + Paul Coffman (IBM) added MPI collectives remap ------------------------------------------------------------------------- */ #include "mpi.h" @@ -347,6 +348,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan) 1 = permute once = mid->fast, slow->mid, fast->slow 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT + usecollective use collective MPI operations for remapping data ------------------------------------------------------------------------- */ struct fft_plan_3d *fft_3d_create_plan( @@ -355,7 +357,7 @@ struct fft_plan_3d *fft_3d_create_plan( int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf) + int scaled, int permute, int *nbuf, int usecollective) { struct fft_plan_3d *plan; int me,nprocs; @@ -430,7 +432,7 @@ struct fft_plan_3d *fft_3d_create_plan( plan->pre_plan = remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, - first_klo,first_khi,2,0,0,FFT_PRECISION); + first_klo,first_khi,2,0,0,FFT_PRECISION,0); if (plan->pre_plan == NULL) return NULL; } @@ -454,7 +456,7 @@ struct fft_plan_3d *fft_3d_create_plan( first_ilo,first_ihi,first_jlo,first_jhi, first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, - second_klo,second_khi,2,1,0,FFT_PRECISION); + second_klo,second_khi,2,1,0,FFT_PRECISION,usecollective); if (plan->mid1_plan == NULL) return NULL; // 1d FFTs along mid axis @@ -496,7 +498,7 @@ struct fft_plan_3d *fft_3d_create_plan( second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, - third_ilo,third_ihi,2,1,0,FFT_PRECISION); + third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); if (plan->mid2_plan == NULL) return NULL; // 1d FFTs along slow axis @@ -525,7 +527,7 @@ struct fft_plan_3d *fft_3d_create_plan( third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, - out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION); + out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); if (plan->post_plan == NULL) return NULL; } diff --git a/src/KSPACE/fft3d.h b/src/KSPACE/fft3d.h index 692a80f9cb..34a7bcf9d6 100644 --- a/src/KSPACE/fft3d.h +++ b/src/KSPACE/fft3d.h @@ -319,7 +319,7 @@ extern "C" { struct fft_plan_3d *fft_3d_create_plan(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int *); + int, int, int *, int); void fft_3d_destroy_plan(struct fft_plan_3d *); void factor(int, int *, int *); void bifactor(int, int *, int *); diff --git a/src/KSPACE/fft3d_wrap.cpp b/src/KSPACE/fft3d_wrap.cpp index da8445e2b5..fb5d3e3d2b 100644 --- a/src/KSPACE/fft3d_wrap.cpp +++ b/src/KSPACE/fft3d_wrap.cpp @@ -24,12 +24,12 @@ FFT3d::FFT3d(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf) : Pointers(lmp) + int scaled, int permute, int *nbuf, int usecollective) : Pointers(lmp) { plan = fft_3d_create_plan(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf); + scaled,permute,nbuf,usecollective); if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); } diff --git a/src/KSPACE/fft3d_wrap.h b/src/KSPACE/fft3d_wrap.h index d324eb31c3..372dced4ef 100644 --- a/src/KSPACE/fft3d_wrap.h +++ b/src/KSPACE/fft3d_wrap.h @@ -22,7 +22,7 @@ namespace LAMMPS_NS { class FFT3d : protected Pointers { public: FFT3d(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,int,int,int, - int,int,int,int,int,int,int,int,int *); + int,int,int,int,int,int,int,int,int *,int); ~FFT3d(); void compute(FFT_SCALAR *, FFT_SCALAR *, int); void timing1d(FFT_SCALAR *, int, int); diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index c3b54559a0..ac217a6f1d 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -800,17 +800,17 @@ void PPPM::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1268,8 +1268,8 @@ void PPPM::adjust_gewald() } /* ---------------------------------------------------------------------- - Calculate f(x) using Newton-Raphson solver - ------------------------------------------------------------------------- */ + calculate f(x) using Newton-Raphson solver +------------------------------------------------------------------------- */ double PPPM::newton_raphson_f() { @@ -1287,9 +1287,9 @@ double PPPM::newton_raphson_f() } /* ---------------------------------------------------------------------- - Calculate numerical derivative f'(x) using forward difference - [f(x + h) - f(x)] / h - ------------------------------------------------------------------------- */ + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ double PPPM::derivf() { @@ -1307,7 +1307,7 @@ double PPPM::derivf() } /* ---------------------------------------------------------------------- - Calculate the final estimate of the accuracy + calculate the final estimate of the accuracy ------------------------------------------------------------------------- */ double PPPM::final_accuracy() @@ -1315,7 +1315,6 @@ double PPPM::final_accuracy() double xprd = domain->xprd; double yprd = domain->yprd; double zprd = domain->zprd; - double zprd_slab = zprd*slab_volfactor; bigint natoms = atom->natoms; double df_kspace = compute_df_kspace(); @@ -1323,7 +1322,7 @@ double PPPM::final_accuracy() double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff); double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace); double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace + - df_table*df_table); + df_table*df_table); return estimated_accuracy; } @@ -3385,7 +3384,7 @@ void PPPM::poisson_groups(int AA_flag) void PPPM::poisson_groups_triclinic() { - int i,j,k,n; + int i,n; // reuse memory (already declared) diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index e347d528de..0750ec02e9 100755 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -1652,17 +1652,17 @@ void PPPMDisp::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1731,17 +1731,17 @@ void PPPMDisp::allocate() fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap_6 = new Remap(lmp,world, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1887,17 +1887,17 @@ void PPPMDisp::allocate() fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap_6 = new Remap(lmp,world, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1967,17 +1967,17 @@ void PPPMDisp::allocate() fft1_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2_6 = new FFT3d(lmp,world,nx_pppm_6,ny_pppm_6,nz_pppm_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap_6 = new Remap(lmp,world, nxlo_in_6,nxhi_in_6,nylo_in_6,nyhi_in_6,nzlo_in_6,nzhi_in_6, nxlo_fft_6,nxhi_fft_6,nylo_fft_6,nyhi_fft_6,nzlo_fft_6,nzhi_fft_6, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication diff --git a/src/KSPACE/pppm_old.cpp b/src/KSPACE/pppm_old.cpp index 22c7471b18..c2959823ff 100644 --- a/src/KSPACE/pppm_old.cpp +++ b/src/KSPACE/pppm_old.cpp @@ -839,17 +839,17 @@ void PPPMOld::allocate() fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp); + 0,0,&tmp,collective_flag); fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp); + 0,0,&tmp,collective_flag); remap = new Remap(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION); + 1,0,0,FFT_PRECISION,collective_flag); } /* ---------------------------------------------------------------------- diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp index a43bb2294d..51e4d1d6f2 100644 --- a/src/KSPACE/remap.cpp +++ b/src/KSPACE/remap.cpp @@ -61,51 +61,151 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, struct remap_plan_3d *plan) - { - MPI_Status status; - int i,isend,irecv; - FFT_SCALAR *scratch; + // use point-to-point communication - if (plan->memory == 0) - scratch = buf; - else - scratch = plan->scratch; + if (!plan->usecollective) { + MPI_Status status; + int i,isend,irecv; + FFT_SCALAR *scratch; - // post all recvs into scratch space + if (plan->memory == 0) + scratch = buf; + else + scratch = plan->scratch; - for (irecv = 0; irecv < plan->nrecv; irecv++) - MPI_Irecv(&scratch[plan->recv_bufloc[irecv]],plan->recv_size[irecv], - MPI_FFT_SCALAR,plan->recv_proc[irecv],0, - plan->comm,&plan->request[irecv]); + // post all recvs into scratch space - // send all messages to other procs + for (irecv = 0; irecv < plan->nrecv; irecv++) + MPI_Irecv(&scratch[plan->recv_bufloc[irecv]],plan->recv_size[irecv], + MPI_FFT_SCALAR,plan->recv_proc[irecv],0, + plan->comm,&plan->request[irecv]); - for (isend = 0; isend < plan->nsend; isend++) { - plan->pack(&in[plan->send_offset[isend]], - plan->sendbuf,&plan->packplan[isend]); - MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, - plan->send_proc[isend],0,plan->comm); - } + // send all messages to other procs - // copy in -> scratch -> out for self data + for (isend = 0; isend < plan->nsend; isend++) { + plan->pack(&in[plan->send_offset[isend]], + plan->sendbuf,&plan->packplan[isend]); + MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } - if (plan->self) { - isend = plan->nsend; - irecv = plan->nrecv; - plan->pack(&in[plan->send_offset[isend]], - &scratch[plan->recv_bufloc[irecv]], - &plan->packplan[isend]); - plan->unpack(&scratch[plan->recv_bufloc[irecv]], - &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); - } + // copy in -> scratch -> out for self data - // unpack all messages from scratch -> out + if (plan->self) { + isend = plan->nsend; + irecv = plan->nrecv; + plan->pack(&in[plan->send_offset[isend]], + &scratch[plan->recv_bufloc[irecv]], + &plan->packplan[isend]); + plan->unpack(&scratch[plan->recv_bufloc[irecv]], + &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); + } - for (i = 0; i < plan->nrecv; i++) { - MPI_Waitany(plan->nrecv,plan->request,&irecv,&status); - plan->unpack(&scratch[plan->recv_bufloc[irecv]], - &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); + // unpack all messages from scratch -> out + + for (i = 0; i < plan->nrecv; i++) { + MPI_Waitany(plan->nrecv,plan->request,&irecv,&status); + plan->unpack(&scratch[plan->recv_bufloc[irecv]], + &out[plan->recv_offset[irecv]],&plan->unpackplan[irecv]); + } + + // use All2Allv collective for remap communication + + } else { + if (plan->commringlen > 0) { + MPI_Status status; + int i,isend,irecv; + FFT_SCALAR *scratch; + + if (plan->memory == 0) scratch = buf; + else scratch = plan->scratch; + + // create send and recv buffers for alltoallv collective + + int sendBufferSize = 0; + int recvBufferSize = 0; + for (int i=0;insend;i++) + sendBufferSize += plan->send_size[i]; + for (int i=0;inrecv;i++) + recvBufferSize += plan->recv_size[i]; + + FFT_SCALAR *packedSendBuffer + = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * sendBufferSize); + FFT_SCALAR *packedRecvBuffer + = (FFT_SCALAR *) malloc(sizeof(FFT_SCALAR) * recvBufferSize); + + int *sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); + int *rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); + int *sdispls = (int *) malloc(sizeof(int) * plan->commringlen); + int *rdispls = (int *) malloc(sizeof(int) * plan->commringlen); + int *nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); + + // create and populate send data, count and displacement buffers + + int currentSendBufferOffset = 0; + for (isend = 0; isend < plan->commringlen; isend++) { + sendcnts[isend] = 0; + sdispls[isend] = 0; + int foundentry = 0; + for (int i=0;(insend && !foundentry); i++) { + if (plan->send_proc[i] == plan->commringlist[isend]) { + foundentry = 1; + sendcnts[isend] = plan->send_size[i]; + sdispls[isend] = currentSendBufferOffset; + plan->pack(&in[plan->send_offset[i]], + &packedSendBuffer[currentSendBufferOffset], + &plan->packplan[i]); + currentSendBufferOffset += plan->send_size[i]; + } + } + } + + // create and populate recv count and displacement buffers + + int currentRecvBufferOffset = 0; + for (irecv = 0; irecv < plan->commringlen; irecv++) { + rcvcnts[irecv] = 0; + rdispls[irecv] = 0; + nrecvmap[irecv] = -1; + int foundentry = 0; + for (int i=0;(inrecv && !foundentry); i++) { + if (plan->recv_proc[i] == plan->commringlist[irecv]) { + foundentry = 1; + rcvcnts[irecv] = plan->recv_size[i]; + rdispls[irecv] = currentRecvBufferOffset; + currentRecvBufferOffset += plan->recv_size[i]; + nrecvmap[irecv] = i; + } + } + } + + int mpirc = MPI_Alltoallv(packedSendBuffer, sendcnts, sdispls, + MPI_FFT_SCALAR, packedRecvBuffer, rcvcnts, + rdispls, MPI_FFT_SCALAR, plan->comm); + + // unpack the data from the recv buffer into out + + currentRecvBufferOffset = 0; + for (irecv = 0; irecv < plan->commringlen; irecv++) { + if (nrecvmap[irecv] > -1) { + plan->unpack(&packedRecvBuffer[currentRecvBufferOffset], + &out[plan->recv_offset[nrecvmap[irecv]]], + &plan->unpackplan[nrecvmap[irecv]]); + currentRecvBufferOffset += plan->recv_size[nrecvmap[irecv]]; + } + } + + // free temporary data structures + + free(sendcnts); + free(rcvcnts); + free(sdispls); + free(rdispls); + free(nrecvmap); + free(packedSendBuffer); + free(packedRecvBuffer); + } } } @@ -131,19 +231,21 @@ void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf, precision precision of data 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) + usecollective whether to use collective MPI or point-to-point ------------------------------------------------------------------------- */ struct remap_plan_3d *remap_3d_create_plan( - MPI_Comm comm, - int in_ilo, int in_ihi, int in_jlo, int in_jhi, - int in_klo, int in_khi, - int out_ilo, int out_ihi, int out_jlo, int out_jhi, - int out_klo, int out_khi, - int nqty, int permute, int memory, int precision) + MPI_Comm comm, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int nqty, int permute, int memory, int precision, int usecollective) { + struct remap_plan_3d *plan; - struct extent_3d *array; + struct extent_3d *inarray, *outarray; struct extent_3d in,out,overlap; int i,iproc,nsend,nrecv,ibuf,size,me,nprocs; @@ -156,6 +258,7 @@ struct remap_plan_3d *remap_3d_create_plan( plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d)); if (plan == NULL) return NULL; + plan->usecollective = usecollective; // store parameters in local data structs @@ -185,11 +288,14 @@ struct remap_plan_3d *remap_3d_create_plan( // combine output extents across all procs - array = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); - if (array == NULL) return NULL; + inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (inarray == NULL) return NULL; + + outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (outarray == NULL) return NULL; MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, - array,sizeof(struct extent_3d),MPI_BYTE,comm); + outarray,sizeof(struct extent_3d),MPI_BYTE,comm); // count send collides, including self @@ -198,7 +304,7 @@ struct remap_plan_3d *remap_3d_create_plan( for (i = 0; i < nprocs; i++) { iproc++; if (iproc == nprocs) iproc = 0; - nsend += remap_3d_collide(&in,&array[iproc],&overlap); + nsend += remap_3d_collide(&in,&outarray[iproc],&overlap); } // malloc space for send info @@ -223,11 +329,11 @@ struct remap_plan_3d *remap_3d_create_plan( for (i = 0; i < nprocs; i++) { iproc++; if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&in,&array[iproc],&overlap)) { + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { plan->send_proc[nsend] = iproc; plan->send_offset[nsend] = nqty * ((overlap.klo-in.klo)*in.jsize*in.isize + - ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); plan->packplan[nsend].nfast = nqty*overlap.isize; plan->packplan[nsend].nmid = overlap.jsize; plan->packplan[nsend].nslow = overlap.ksize; @@ -241,15 +347,18 @@ struct remap_plan_3d *remap_3d_create_plan( // plan->nsend = # of sends not including self - if (nsend && plan->send_proc[nsend-1] == me) - plan->nsend = nsend - 1; - else + if (nsend && plan->send_proc[nsend-1] == me) { + if (plan->usecollective) // for collectives include self in nsend list + plan->nsend = nsend; + else + plan->nsend = nsend - 1; + } else plan->nsend = nsend; // combine input extents across all procs MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, - array,sizeof(struct extent_3d),MPI_BYTE,comm); + inarray,sizeof(struct extent_3d),MPI_BYTE,comm); // count recv collides, including self @@ -258,7 +367,7 @@ struct remap_plan_3d *remap_3d_create_plan( for (i = 0; i < nprocs; i++) { iproc++; if (iproc == nprocs) iproc = 0; - nrecv += remap_3d_collide(&out,&array[iproc],&overlap); + nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap); } // malloc space for recv info @@ -305,7 +414,7 @@ struct remap_plan_3d *remap_3d_create_plan( for (i = 0; i < nprocs; i++) { iproc++; if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&out,&array[iproc],&overlap)) { + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { plan->recv_proc[nrecv] = iproc; plan->recv_bufloc[nrecv] = ibuf; @@ -349,25 +458,128 @@ struct remap_plan_3d *remap_3d_create_plan( } } - // plan->nrecv = # of recvs not including self + // create sub-comm rank list - if (nrecv && plan->recv_proc[nrecv-1] == me) - plan->nrecv = nrecv - 1; - else - plan->nrecv = nrecv; + if (plan->usecollective) { + plan->commringlist = NULL; + + // merge recv and send rank lists + // ask Steve Plimpton about method to more accurately determine + // maximum number of procs contributing to pencil + + int maxcommsize = nprocs; + int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); + int commringlen = 0; + + for (int i = 0; i < nrecv; i++) { + commringlist[i] = plan->recv_proc[i]; + commringlen++; + } + + for (int i = 0; i < nsend; i++) { + int foundentry = 0; + for (int j=0;jsend_proc[i]) foundentry = 1; + if (!foundentry) { + commringlist[commringlen] = plan->send_proc[i]; + commringlen++; + } + } + + // sort initial commringlist + + int swap = 0; + for (int c = 0 ; c < (commringlen - 1); c++) { + for (int d = 0 ; d < commringlen - c - 1; d++) { + if (commringlist[d] > commringlist[d+1]) { + swap = commringlist[d]; + commringlist[d] = commringlist[d+1]; + commringlist[d+1] = swap; + } + } + } + + // collide all inarray extents for the comm ring with all output + // extents and all outarray extents for the comm ring with all input + // extents - if there is a collison add the rank to the comm ring, + // keep iterating until nothing is added to commring + + int commringappend = 1; + while (commringappend) { + int newcommringlen = commringlen; + commringappend = 0; + for (int i=0;i commringlist[d+1]) { + swap = commringlist[d]; + commringlist[d] = commringlist[d+1]; + commringlist[d+1] = swap; + } + } + } + + // resize commringlist to final size + + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + + // set the plan->commringlist + + plan->commringlen = commringlen; + plan->commringlist = commringlist; + } + + // plan->nrecv = # of recvs not including self + // for collectives include self in the nsend list + + if (nrecv && plan->recv_proc[nrecv-1] == me) { + if (plan->usecollective) plan->nrecv = nrecv; + else plan->nrecv = nrecv - 1; + } else plan->nrecv = nrecv; // init remaining fields in remap plan plan->memory = memory; - if (nrecv == plan->nrecv) - plan->self = 0; - else - plan->self = 1; + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; // free locally malloced space - free(array); + free(inarray); + free(outarray); // find biggest send message (not including self) and malloc space for it @@ -390,14 +602,34 @@ struct remap_plan_3d *remap_3d_create_plan( if (memory == 1) { if (nrecv > 0) { plan->scratch = - (FFT_SCALAR *) malloc(nqty*out.isize*out.jsize*out.ksize*sizeof(FFT_SCALAR)); + (FFT_SCALAR *) malloc(nqty*out.isize*out.jsize*out.ksize * + sizeof(FFT_SCALAR)); if (plan->scratch == NULL) return NULL; } } - // create new MPI communicator for remap + // if using collective and the commringlist is NOT empty create a + // communicator for the plan based off an MPI_Group created with + // ranks from the commringlist - MPI_Comm_dup(comm,&plan->comm); + if ((plan->usecollective && (plan->commringlen > 0))) { + MPI_Group orig_group, new_group; + MPI_Comm_group(comm, &orig_group); + MPI_Group_incl(orig_group, plan->commringlen, + plan->commringlist, &new_group); + MPI_Comm_create(comm, new_group, &plan->comm); + } + + // if using collective and the comm ring list is empty create + // a communicator for the plan with an empty group + + else if ((plan->usecollective) && (plan->commringlen == 0)) { + MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); + } + + // not using collective - dup comm + + else MPI_Comm_dup(comm,&plan->comm); // return pointer to plan @@ -409,11 +641,16 @@ struct remap_plan_3d *remap_3d_create_plan( ------------------------------------------------------------------------- */ void remap_3d_destroy_plan(struct remap_plan_3d *plan) - { // free MPI communicator - MPI_Comm_free(&plan->comm); + if (!((plan->usecollective) && (plan->commringlen == 0))) + MPI_Comm_free(&plan->comm); + + if (plan->usecollective) { + if (plan->commringlist != NULL) + free(plan->commringlist); + } // free internal arrays diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h index 6175690686..90a327bb66 100644 --- a/src/KSPACE/remap.h +++ b/src/KSPACE/remap.h @@ -45,6 +45,9 @@ struct remap_plan_3d { int self; // whether I send/recv with myself int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap + int usecollective; // use collective or point-to-point MPI + int commringlen; // length of commringlist + int *commringlist; // ranks on communication ring of this plan }; // collision between 2 regions @@ -59,8 +62,9 @@ struct extent_3d { void remap_3d(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *, struct remap_plan_3d *); struct remap_plan_3d *remap_3d_create_plan(MPI_Comm, - int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int, int); + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int); void remap_3d_destroy_plan(struct remap_plan_3d *); int remap_3d_collide(struct extent_3d *, struct extent_3d *, struct extent_3d *); diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp index dc159cc8f8..d790e9e7c0 100644 --- a/src/KSPACE/remap_wrap.cpp +++ b/src/KSPACE/remap_wrap.cpp @@ -24,12 +24,13 @@ Remap::Remap(LAMMPS *lmp, MPI_Comm comm, int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int nqty, int permute, int memory, int precision) : Pointers(lmp) + int nqty, int permute, int memory, + int precision, int usecollective) : Pointers(lmp) { plan = remap_3d_create_plan(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - nqty,permute,memory,precision); + nqty,permute,memory,precision,usecollective); if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan"); } diff --git a/src/KSPACE/remap_wrap.h b/src/KSPACE/remap_wrap.h index d9f817ed42..38b2ad5ad2 100644 --- a/src/KSPACE/remap_wrap.h +++ b/src/KSPACE/remap_wrap.h @@ -22,7 +22,7 @@ namespace LAMMPS_NS { class Remap : protected Pointers { public: Remap(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, - int,int,int,int,int,int,int,int,int,int); + int,int,int,int,int,int,int,int,int,int,int); ~Remap(); void perform(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *); diff --git a/src/USER-PHONON/fix_phonon.cpp b/src/USER-PHONON/fix_phonon.cpp index 8e22494c1c..fec3a2cee6 100644 --- a/src/USER-PHONON/fix_phonon.cpp +++ b/src/USER-PHONON/fix_phonon.cpp @@ -158,7 +158,7 @@ FixPhonon::FixPhonon(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) for (int i = 1; i < nprocs; ++i) fft_disp[i] = fft_disp[i-1]+fft_cnts[i-1]; delete []nx_loc; - fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize); + fft = new FFT3d(lmp,world,nz,ny,nx,0,nz-1,0,ny-1,nxlo,nxhi,0,nz-1,0,ny-1,nxlo,nxhi,0,0,&mysize,0); memory->create(fft_data, MAX(1,mynq)*2, "fix_phonon:fft_data"); // allocate variables; MAX(1,... is used because NULL buffer will result in error for MPI @@ -225,7 +225,6 @@ FixPhonon::FixPhonon(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) temperature = modify->compute[icompute]; inv_nTemp = 1.0/group->count(temperature->igroup); -return; } // end of constructor /* ---------------------------------------------------------------------- */ @@ -294,8 +293,6 @@ void FixPhonon::init() int count = 0; for (int i = 0; i < modify->nfix; ++i) if (strcmp(modify->fix[i]->style,"gfc") == 0) ++count; if (count > 1 && me == 0) error->warning(FLERR,"More than one fix phonon defined"); // just warn, but allowed. - -return; } /* ---------------------------------------------------------------------- */ @@ -318,8 +315,6 @@ void FixPhonon::setup(int flag) prev_nstep = update->ntimestep; neval = ifreq = 0; - -return; } /* ---------------------------------------------------------------------- */ @@ -422,7 +417,6 @@ void FixPhonon::end_of_step() // compute and output Phi_q after every nfreq evaluations if (++ifreq == nfreq) postprocess(); -return; } // end of end_of_step() /* ---------------------------------------------------------------------- */ @@ -459,8 +453,7 @@ int FixPhonon::modify_param(int narg, char **arg) return 2; } - -return 0; + return 0; } /* ---------------------------------------------------------------------- @@ -522,8 +515,6 @@ void FixPhonon::getmass() delete []mass_all; delete []type_one; delete []type_all; - -return; } @@ -629,8 +620,6 @@ void FixPhonon::readmap() error->one(FLERR,"The mapping info read is incorrect!"); } } - -return; } /* ---------------------------------------------------------------------- @@ -780,7 +769,6 @@ void FixPhonon::postprocess( ) fflush(flog); } -return; } // end of postprocess /* ---------------------------------------------------------------------- @@ -864,8 +852,6 @@ void FixPhonon::GaussJordan(int n, std::complex *Mat) delete []indxr; delete []indxc; delete []ipiv; - -return; } /* ---------------------------------------------------------------------- @@ -926,7 +912,5 @@ void FixPhonon::EnforceASR() } } } - -return; } /* --------------------------------------------------------------------*/ diff --git a/src/kspace.cpp b/src/kspace.cpp index 57930d557d..5966ee326e 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -44,7 +44,15 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) gewaldflag = 0; minorder = 2; overlap_allowed = 1; - fftbench = 1; + fftbench = 0; + + // default to using MPI collectives for FFT/remap only on IBM BlueGene + +#ifdef __bg__ + collective_flag = 1; +#else + collective_flag = 0; +#endif kewaldflag = 0; @@ -356,31 +364,31 @@ void KSpace::modify_params(int narg, char **arg) while (iarg < narg) { if (strcmp(arg[iarg],"mesh") == 0) { if (iarg+4 > narg) error->all(FLERR,"Illegal kspace_modify command"); - nx_pppm = nx_msm_max = atoi(arg[iarg+1]); - ny_pppm = ny_msm_max = atoi(arg[iarg+2]); - nz_pppm = nz_msm_max = atoi(arg[iarg+3]); + nx_pppm = nx_msm_max = force->inumeric(FLERR,arg[iarg+1]); + ny_pppm = ny_msm_max = force->inumeric(FLERR,arg[iarg+2]); + nz_pppm = nz_msm_max = force->inumeric(FLERR,arg[iarg+3]); if (nx_pppm == 0 && ny_pppm == 0 && nz_pppm == 0) gridflag = 0; else gridflag = 1; iarg += 4; } else if (strcmp(arg[iarg],"mesh/disp") == 0) { if (iarg+4 > narg) error->all(FLERR,"Illegal kspace_modify command"); - nx_pppm_6 = atoi(arg[iarg+1]); - ny_pppm_6 = atoi(arg[iarg+2]); - nz_pppm_6 = atoi(arg[iarg+3]); + nx_pppm_6 = force->inumeric(FLERR,arg[iarg+1]); + ny_pppm_6 = force->inumeric(FLERR,arg[iarg+2]); + nz_pppm_6 = force->inumeric(FLERR,arg[iarg+3]); if (nx_pppm_6 == 0 || ny_pppm_6 == 0 || nz_pppm_6 == 0) gridflag_6 = 0; else gridflag_6 = 1; iarg += 4; } else if (strcmp(arg[iarg],"order") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - order = atoi(arg[iarg+1]); + order = force->inumeric(FLERR,arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"order/disp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - order_6 = atoi(arg[iarg+1]); + order_6 = force->inumeric(FLERR,arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"minorder") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - minorder = atoi(arg[iarg+1]); + minorder = force->inumeric(FLERR,arg[iarg+1]); if (minorder < 2) error->all(FLERR,"Illegal kspace_modify command"); iarg += 2; } else if (strcmp(arg[iarg],"overlap") == 0) { @@ -391,17 +399,17 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"force") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_absolute = atof(arg[iarg+1]); + accuracy_absolute = force->numeric(FLERR,arg[iarg+1]); iarg += 2; } else if (strcmp(arg[iarg],"gewald") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - g_ewald = atof(arg[iarg+1]); + g_ewald = force->numeric(FLERR,arg[iarg+1]); if (g_ewald == 0.0) gewaldflag = 0; else gewaldflag = 1; iarg += 2; } else if (strcmp(arg[iarg],"gewald/disp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - g_ewald_6 = atof(arg[iarg+1]); + g_ewald_6 = force->numeric(FLERR,arg[iarg+1]); if (g_ewald_6 == 0.0) gewaldflag_6 = 0; else gewaldflag_6 = 1; iarg += 2; @@ -411,7 +419,7 @@ void KSpace::modify_params(int narg, char **arg) slabflag = 2; } else { slabflag = 1; - slab_volfactor = atof(arg[iarg+1]); + slab_volfactor = force->numeric(FLERR,arg[iarg+1]); if (slab_volfactor <= 1.0) error->all(FLERR,"Bad kspace_modify slab parameter"); if (slab_volfactor < 2.0 && comm->me == 0) @@ -431,6 +439,12 @@ void KSpace::modify_params(int narg, char **arg) else if (strcmp(arg[iarg+1],"no") == 0) fftbench = 0; else error->all(FLERR,"Illegal kspace_modify command"); iarg += 2; + } else if (strcmp(arg[iarg],"collective") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); + if (strcmp(arg[iarg+1],"yes") == 0) collective_flag = 1; + else if (strcmp(arg[iarg+1],"no") == 0) collective_flag = 0; + else error->all(FLERR,"Illegal kspace_modify command"); + iarg += 2; } else if (strcmp(arg[iarg],"diff") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); if (strcmp(arg[iarg+1],"ad") == 0) differentiation_flag = 1; diff --git a/src/kspace.h b/src/kspace.h index ecf3e97205..ed5b93ee07 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -73,7 +73,7 @@ class KSpace : protected Pointers { int compute_flag; // 0 if skip compute() int fftbench; // 0 if skip FFT timing - + int collective_flag; // 1 if use MPI collectives for FFT/remap int stagger_flag; // 1 if using staggered PPPM grids double splittol; // tolerance for when to truncate the splitting