From e1f70c3a98c0b2fb18e24205062ddeaed6e0f715 Mon Sep 17 00:00:00 2001
From: sjplimp
Date: Thu, 8 Dec 2011 22:31:22 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7314
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/STUBS/mpi.cpp |  25 +-
 src/STUBS/mpi.h   |   7 +-
 src/comm.cpp      | 593 ++++++++++++++++++++++++++++++++++------------
 src/comm.h        |  20 +-
 src/math_extra.h  |   6 +-
 5 files changed, 487 insertions(+), 164 deletions(-)

diff --git a/src/STUBS/mpi.cpp b/src/STUBS/mpi.cpp
index 35dab3b1d5..e3d7f6eff1 100644
--- a/src/STUBS/mpi.cpp
+++ b/src/STUBS/mpi.cpp
@@ -415,8 +415,8 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
 
 /* copy values from data1 to data2 */
 
 int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                void *recvbuf, int *recvcounts, int *displs,
-                MPI_Datatype recvtype, int root, MPI_Comm comm)
+                void *recvbuf, int *recvcounts, int *displs,
+                MPI_Datatype recvtype, int root, MPI_Comm comm)
 {
   int n;
   if (sendtype == MPI_INT) n = sendcount*sizeof(int);
@@ -430,3 +430,24 @@ int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
   memcpy(recvbuf,sendbuf,n);
   return 0;
 }
+
+/* ---------------------------------------------------------------------- */
+
+/* copy values from data1 to data2 */
+
+int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs,
+                 MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                 MPI_Datatype recvtype, int root, MPI_Comm comm)
+{
+  int n;
+  if (sendtype == MPI_INT) n = recvcount*sizeof(int);
+  else if (sendtype == MPI_FLOAT) n = recvcount*sizeof(float);
+  else if (sendtype == MPI_DOUBLE) n = recvcount*sizeof(double);
+  else if (sendtype == MPI_CHAR) n = recvcount*sizeof(char);
+  else if (sendtype == MPI_BYTE) n = recvcount*sizeof(char);
+  else if (sendtype == MPI_LONG_LONG) n = recvcount*sizeof(uint64_t);
+  else if (sendtype == MPI_DOUBLE_INT) n = recvcount*sizeof(double_int);
+
+  memcpy(recvbuf,sendbuf,n);
+  return 0;
+}
diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h
index 4a4ce5644e..bebf65316a 100644
--- a/src/STUBS/mpi.h
+++ b/src/STUBS/mpi.h
@@ -115,7 +115,10 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int recvcount, MPI_Datatype recvtype,
                int root, MPI_Comm comm);
 int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                void *recvbuf, int *recvcounts, int *displs,
-                MPI_Datatype recvtype, int root, MPI_Comm comm);
+                void *recvbuf, int *recvcounts, int *displs,
+                MPI_Datatype recvtype, int root, MPI_Comm comm);
+int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs,
+                 MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                 MPI_Datatype recvtype, int root, MPI_Comm comm);
 
 #endif
diff --git a/src/comm.cpp b/src/comm.cpp
index a1ca10cdf7..c5c4a17403 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -35,9 +35,13 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
+#include "math_extra.h"
 #include "error.h"
 #include "memory.h"
 
+#include <map>
+#include <string>
+
 #ifdef _OPENMP
 #include "omp.h"
 #endif
@@ -51,6 +55,7 @@ using namespace LAMMPS_NS;
 
 enum{SINGLE,MULTI};
 enum{NONE,MULTIPLE};
+enum{CART,CARTREORDER,XYZ,XZY,YXZ,YZX,ZXY,ZYX};
 
 /* ----------------------------------------------------------------------
    setup MPI and allocate buffer space
@@ -63,6 +68,8 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
   user_procgrid[0] = user_procgrid[1] = user_procgrid[2] = 0;
   grid2proc = NULL;
 
+  layoutflag = CART;
+  numa_nodes = 0;
+
   bordergroup = 0;
   style = SINGLE;
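
The MPI_Scatterv stub added to src/STUBS/mpi.cpp above can be exercised with a
minimal single-rank driver. This is a hypothetical usage sketch, not part of the
patch; with the STUBS library everything runs on one rank, so the call reduces
to the memcpy in the stub body:

#include <stdio.h>
#include "mpi.h"    /* the STUBS header, not a real MPI */

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);

  /* one rank, so one chunk: 4 doubles starting at displacement 0 */
  double sendbuf[4] = {1.0, 2.0, 3.0, 4.0};
  double recvbuf[4];
  int counts[1] = {4};
  int displs[1] = {0};

  /* stub computes n = recvcount*sizeof(double), then memcpy's sendbuf */
  MPI_Scatterv(sendbuf,counts,displs,MPI_DOUBLE,
               recvbuf,4,MPI_DOUBLE,0,MPI_COMM_WORLD);

  printf("%g %g %g %g\n",recvbuf[0],recvbuf[1],recvbuf[2],recvbuf[3]);
  MPI_Finalize();
  return 0;
}
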
@@ -131,61 +138,7 @@ Comm::~Comm()
 }
 
 /* ----------------------------------------------------------------------
-   set dimensions of processor grid
-   invoked from input script by processors command
-------------------------------------------------------------------------- */
-
-void Comm::set_processors(int narg, char **arg)
-{
-  if (narg < 3) error->all(FLERR,"Illegal processors command");
-
-  if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0;
-  else user_procgrid[0] = atoi(arg[0]);
-  if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0;
-  else user_procgrid[1] = atoi(arg[1]);
-  if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0;
-  else user_procgrid[2] = atoi(arg[2]);
-
-  if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0)
-    error->all(FLERR,"Illegal processors command");
-
-  int iarg = 3;
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"part") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
-      if (universe->nworlds == 1)
-        error->all(FLERR,
-                   "Cannot use processors part command "
-                   "without using partitions");
-      int isend = atoi(arg[iarg+1]);
-      int irecv = atoi(arg[iarg+2]);
-      if (isend < 1 || isend > universe->nworlds ||
-          irecv < 1 || irecv > universe->nworlds || isend == irecv)
-        error->all(FLERR,"Invalid partitions in processors part command");
-      if (isend-1 == universe->iworld) {
-        if (send_to_partition >= 0)
-          error->all(FLERR,
-                     "Sending partition in processors part command "
-                     "is already a sender");
-        send_to_partition = irecv-1;
-      }
-      if (irecv-1 == universe->iworld) {
-        if (recv_from_partition >= 0)
-          error->all(FLERR,
-                     "Receiving partition in processors part command "
-                     "is already a receiver");
-        recv_from_partition = isend-1;
-      }
-      if (strcmp(arg[iarg+3],"multiple") == 0) {
-        if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
-      } else error->all(FLERR,"Illegal processors command");
-      iarg += 4;
-    } else error->all(FLERR,"Illegal processors command");
-  }
-}
-
-/* ----------------------------------------------------------------------
-   setup 3d grid of procs based on box size
+   create 3d grid of procs based on box size
 ------------------------------------------------------------------------- */
 
 void Comm::set_proc_grid()
@@ -200,9 +153,18 @@ void Comm::set_proc_grid()
     MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
   }
 
-  // create layout of procs mapped to simulation box
+  // use NUMA routines if numa_nodes > 0
+  // if NUMA routines fail, just continue
 
-  procs2box();
+  if (numa_nodes)
+    if (numa_set_proc_grid()) return;
+
+  // create layout of procs mapped to simulation box
+  // can fail (on one partition) if constrained by other_partition_style
+
+  int flag =
+    procs2box(nprocs,user_procgrid,procgrid,1,1,1,other_partition_style);
+  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
 
   // error check
 
@@ -211,35 +173,89 @@ void Comm::set_proc_grid()
   if (domain->dimension == 2 && procgrid[2] != 1)
     error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
 
+  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
+
   if (grid2proc) memory->destroy(grid2proc);
   memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
                  "comm:grid2proc");
 
-  // use MPI Cartesian routines to setup 3d grid of procs
-  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
-  // let MPI compute it instead of LAMMPS in case it is machine optimized
+  // use MPI Cartesian routines to lay out the 3d grid of procs
+  // MPI may do the layout in a machine-optimized fashion
 
-  int reorder = 0;
-  int periods[3];
-  periods[0] = periods[1] = periods[2] = 1;
-  MPI_Comm cartesian;
+  if (layoutflag == CART || layoutflag == CARTREORDER) {
+    int reorder = 0;
+    if (layoutflag == CARTREORDER) reorder = 1;
+    int periods[3];
+    periods[0] = periods[1] = periods[2] = 1;
+    MPI_Comm cartesian;
 
-  MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
-  MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
-  MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
-  MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
-  MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
+    MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
+    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
+    MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
+    MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
+    MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
 
-  int coords[3];
-  int i,j,k;
-  for (i = 0; i < procgrid[0]; i++)
-    for (j = 0; j < procgrid[1]; j++)
-      for (k = 0; k < procgrid[2]; k++) {
-        coords[0] = i; coords[1] = j; coords[2] = k;
-        MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
-      }
+    int coords[3];
+    int i,j,k;
+    for (i = 0; i < procgrid[0]; i++)
+      for (j = 0; j < procgrid[1]; j++)
+        for (k = 0; k < procgrid[2]; k++) {
+          coords[0] = i; coords[1] = j; coords[2] = k;
+          MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
+        }
+
+    MPI_Comm_free(&cartesian);
 
-  MPI_Comm_free(&cartesian);
+  // lay out 3d grid of procs explicitly, via user-requested XYZ ordering
+
+  } else {
+    int i,j,k,ime,jme,kme;
+    for (i = 0; i < procgrid[0]; i++)
+      for (j = 0; j < procgrid[1]; j++)
+        for (k = 0; k < procgrid[2]; k++) {
+          if (layoutflag == XYZ)
+            grid2proc[i][j][k] = k*procgrid[1]*procgrid[0] + j*procgrid[0] + i;
+          else if (layoutflag == XZY)
+            grid2proc[i][j][k] = j*procgrid[2]*procgrid[0] + k*procgrid[0] + i;
+          else if (layoutflag == YXZ)
+            grid2proc[i][j][k] = k*procgrid[0]*procgrid[1] + i*procgrid[1] + j;
+          else if (layoutflag == YZX)
+            grid2proc[i][j][k] = i*procgrid[2]*procgrid[1] + k*procgrid[1] + j;
+          else if (layoutflag == ZXY)
+            grid2proc[i][j][k] = j*procgrid[0]*procgrid[2] + i*procgrid[2] + k;
+          else if (layoutflag == ZYX)
+            grid2proc[i][j][k] = i*procgrid[1]*procgrid[2] + j*procgrid[2] + k;
+
+          if (grid2proc[i][j][k] == me) {
+            ime = i; jme = j; kme = k;
+          }
+        }
+
+    myloc[0] = ime;
+    myloc[1] = jme;
+    myloc[2] = kme;
+
+    i = ime-1;
+    if (i < 0) i = procgrid[0]-1;
+    procneigh[0][0] = grid2proc[i][jme][kme];
+    i = ime+1;
+    if (i == procgrid[0]) i = 0;
+    procneigh[0][1] = grid2proc[i][jme][kme];
+
+    j = jme-1;
+    if (j < 0) j = procgrid[1]-1;
+    procneigh[1][0] = grid2proc[ime][j][kme];
+    j = jme+1;
+    if (j == procgrid[1]) j = 0;
+    procneigh[1][1] = grid2proc[ime][j][kme];
+
+    k = kme-1;
+    if (k < 0) k = procgrid[2]-1;
+    procneigh[2][0] = grid2proc[ime][jme][k];
+    k = kme+1;
+    if (k == procgrid[2]) k = 0;
+    procneigh[2][1] = grid2proc[ime][jme][k];
+  }
 
   // set lamda box params after procs are assigned
 
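
The brute-force search in procs2box() below is worth tracing once with concrete
(hypothetical) numbers, taking sx = sy = sz = 1. For an orthogonal box with
xprd = 2 and yprd = zprd = 1 on numprocs = 12, the face areas are
area = {2, 2, 1}, and a candidate factorization costs
surf = area[0]/(ipx*ipy) + area[1]/(ipx*ipz) + area[2]/(ipy*ipz).
The nested loops visit every ipx*ipy*ipz = 12 and keep the smallest surf:
(4,3,1) and (6,2,1) both score 1.0, while (3,2,2) scores
2/6 + 2/6 + 1/4 = 0.917, so factors ends up (3,2,2).
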
@@ -1179,32 +1195,38 @@ void Comm::reverse_comm_dump(Dump *dump)
 }
 
 /* ----------------------------------------------------------------------
-   assign nprocs to 3d xprd,yprd,zprd box so as to minimize surface area
-   area = surface area of each of 3 faces of simulation box
+   assign numprocs to 3d box so as to minimize surface area
+   area = surface area of each of 3 faces of simulation box divided by sx,sy,sz
    for triclinic, area = cross product of 2 edge vectors stored in h matrix
+   valid assignment will be factorization of numprocs = Px by Py by Pz
+   user_factors = if non-zero, factor is specified by user
+   sx,sy,sz = scale factors: box xyz dimensions are divided by sx,sy,sz
+   return factors = # of procs assigned to each dimension
+   return 1 if successfully factored, 0 if not
 ------------------------------------------------------------------------- */
 
-void Comm::procs2box()
+int Comm::procs2box(int numprocs, int user_factors[3], int factors[3],
+                    const int sx, const int sy, const int sz, int other)
 {
-  procgrid[0] = user_procgrid[0];
-  procgrid[1] = user_procgrid[1];
-  procgrid[2] = user_procgrid[2];
+  factors[0] = user_factors[0];
+  factors[1] = user_factors[1];
+  factors[2] = user_factors[2];
 
   // all 3 proc counts are specified
 
-  if (procgrid[0] && procgrid[1] && procgrid[2]) return;
+  if (factors[0] && factors[1] && factors[2]) return 1;
 
   // 2 out of 3 proc counts are specified
 
-  if (procgrid[0] > 0 && procgrid[1] > 0) {
-    procgrid[2] = nprocs/(procgrid[0]*procgrid[1]);
-    return;
-  } else if (procgrid[0] > 0 && procgrid[2] > 0) {
-    procgrid[1] = nprocs/(procgrid[0]*procgrid[2]);
-    return;
-  } else if (procgrid[1] > 0 && procgrid[2] > 0) {
-    procgrid[0] = nprocs/(procgrid[1]*procgrid[2]);
-    return;
+  if (factors[0] > 0 && factors[1] > 0) {
+    factors[2] = numprocs/(factors[0]*factors[1]);
+    return 1;
+  } else if (factors[0] > 0 && factors[2] > 0) {
+    factors[1] = numprocs/(factors[0]*factors[2]);
+    return 1;
+  } else if (factors[1] > 0 && factors[2] > 0) {
+    factors[0] = numprocs/(factors[1]*factors[2]);
+    return 1;
   }
 
   // determine cross-sectional areas for orthogonal and triclinic boxes
 
@@ -1212,68 +1234,66 @@ void Comm::procs2box()
   double area[3];
   if (domain->triclinic == 0) {
-    area[0] = domain->xprd * domain->yprd;
-    area[1] = domain->xprd * domain->zprd;
-    area[2] = domain->yprd * domain->zprd;
+    area[0] = domain->xprd * domain->yprd / (sx * sy);
+    area[1] = domain->xprd * domain->zprd / (sx * sz);
+    area[2] = domain->yprd * domain->zprd / (sy * sz);
   } else {
     double *h = domain->h;
-    double x,y,z;
-    cross(h[0],0.0,0.0,h[5],h[1],0.0,x,y,z);
-    area[0] = sqrt(x*x + y*y + z*z);
-    cross(h[0],0.0,0.0,h[4],h[3],h[2],x,y,z);
-    area[1] = sqrt(x*x + y*y + z*z);
-    cross(h[5],h[1],0.0,h[4],h[3],h[2],x,y,z);
-    area[2] = sqrt(x*x + y*y + z*z);
+    double a[3],b[3],c[3];
+    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
+    b[0] = h[5]; b[1] = h[1]; b[2] = 0.0;
+    MathExtra::cross3(a,b,c);
+    area[0] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sy);
+    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
+    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
+    MathExtra::cross3(a,b,c);
+    area[1] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sz);
+    a[0] = h[5]; a[1] = h[1]; a[2] = 0.0;
+    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
+    MathExtra::cross3(a,b,c);
+    area[2] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sy * sz);
   }
 
   double bestsurf = 2.0 * (area[0]+area[1]+area[2]);
 
-  // loop thru all possible factorizations of nprocs
-  // choose factorization with least surface area
+  // loop thru all possible factorizations of numprocs
+  // keep the valid factorization with least surface area
   // surf = surface area of a proc sub-domain
-  // only consider factorizations that match procgrid & other_procgrid settings
-  // if set, only consider factorizations that match other_procgrid settings
-
-  procgrid[0] = procgrid[1] = procgrid[2] = 0;
+  // only consider cases that match user_factors & other_procgrid settings
+  // success = 1 if valid factorization is found
+  // may not be if other constraint is enforced
 
   int ipx,ipy,ipz,valid;
   double surf;
-
+
+  int success = 0;
   ipx = 1;
-  while (ipx <= nprocs) {
+  while (ipx <= numprocs) {
     valid = 1;
-    if (user_procgrid[0] && ipx != 
user_procgrid[0]) valid = 0; - if (other_partition_style) { - if (other_partition_style == MULTIPLE && other_procgrid[0] % ipx) - valid = 0; - } - if (nprocs % ipx) valid = 0; + if (user_factors[0] && ipx != user_factors[0]) valid = 0; + if (other == MULTIPLE && other_procgrid[0] % ipx) valid = 0; + if (numprocs % ipx) valid = 0; + if (!valid) { ipx++; continue; } ipy = 1; - while (ipy <= nprocs/ipx) { + while (ipy <= numprocs/ipx) { valid = 1; - if (user_procgrid[1] && ipy != user_procgrid[1]) valid = 0; - if (other_partition_style) { - if (other_partition_style == MULTIPLE && other_procgrid[1] % ipy) - valid = 0; - } - if ((nprocs/ipx) % ipy) valid = 0; + if (user_factors[1] && ipy != user_factors[1]) valid = 0; + if (other == MULTIPLE && other_procgrid[1] % ipy) valid = 0; + if ((numprocs/ipx) % ipy) valid = 0; if (!valid) { ipy++; continue; } - ipz = nprocs/ipx/ipy; + ipz = numprocs/ipx/ipy; valid = 1; - if (user_procgrid[2] && ipz != user_procgrid[2]) valid = 0; - if (other_partition_style) { - if (other_partition_style == MULTIPLE && other_procgrid[2] % ipz) - valid = 0; - } + if (user_factors[2] && ipz != user_factors[2]) valid = 0; + if (other == MULTIPLE && other_procgrid[2] % ipz) valid = 0; if (domain->dimension == 2 && ipz != 1) valid = 0; if (!valid) { ipy++; @@ -1282,10 +1302,11 @@ void Comm::procs2box() surf = area[0]/ipx/ipy + area[1]/ipx/ipz + area[2]/ipy/ipz; if (surf < bestsurf) { + success = 1; bestsurf = surf; - procgrid[0] = ipx; - procgrid[1] = ipy; - procgrid[2] = ipz; + factors[0] = ipx; + factors[1] = ipy; + factors[2] = ipz; } ipy++; } @@ -1293,21 +1314,7 @@ void Comm::procs2box() ipx++; } - if (procgrid[0] == 0) - error->all(FLERR,"Could not layout grid of processors",1); -} - -/* ---------------------------------------------------------------------- - vector cross product: c = a x b -------------------------------------------------------------------------- */ - -void Comm::cross(double ax, double ay, double az, - double bx, double by, double bz, - double &cx, double &cy, double &cz) -{ - cx = ay*bz - az*by; - cy = az*bx - ax*bz; - cz = ax*by - ay*bx; + return success; } /* ---------------------------------------------------------------------- @@ -1433,6 +1440,7 @@ void Comm::free_multi() /* ---------------------------------------------------------------------- set communication style + invoked from input script by communicate command ------------------------------------------------------------------------- */ void Comm::set(int narg, char **arg) @@ -1470,6 +1478,293 @@ void Comm::set(int narg, char **arg) } } +/* ---------------------------------------------------------------------- + set dimensions for 3d grid of processors, and associated flags + invoked from input script by processors command +------------------------------------------------------------------------- */ + +void Comm::set_processors(int narg, char **arg) +{ + if (narg < 3) error->all(FLERR,"Illegal processors command"); + + if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0; + else user_procgrid[0] = atoi(arg[0]); + if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0; + else user_procgrid[1] = atoi(arg[1]); + if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0; + else user_procgrid[2] = atoi(arg[2]); + + if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0) + error->all(FLERR,"Illegal processors command"); + + int iarg = 3; + while (iarg < narg) { + if (strcmp(arg[iarg],"part") == 0) { + if (iarg+4 > narg) error->all(FLERR,"Illegal processors command"); + if (universe->nworlds 
== 1)
+        error->all(FLERR,
+                   "Cannot use processors part command "
+                   "without using partitions");
+      int isend = atoi(arg[iarg+1]);
+      int irecv = atoi(arg[iarg+2]);
+      if (isend < 1 || isend > universe->nworlds ||
+          irecv < 1 || irecv > universe->nworlds || isend == irecv)
+        error->all(FLERR,"Invalid partitions in processors part command");
+      if (isend-1 == universe->iworld) {
+        if (send_to_partition >= 0)
+          error->all(FLERR,
+                     "Sending partition in processors part command "
+                     "is already a sender");
+        send_to_partition = irecv-1;
+      }
+      if (irecv-1 == universe->iworld) {
+        if (recv_from_partition >= 0)
+          error->all(FLERR,
+                     "Receiving partition in processors part command "
+                     "is already a receiver");
+        recv_from_partition = isend-1;
+      }
+      if (strcmp(arg[iarg+3],"multiple") == 0) {
+        if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
+      } else error->all(FLERR,"Illegal processors command");
+      iarg += 4;
+
+    } else if (strcmp(arg[iarg],"grid") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      if (strcmp(arg[iarg+1],"cart") == 0) layoutflag = CART;
+      else if (strcmp(arg[iarg+1],"cart/reorder") == 0)
+        layoutflag = CARTREORDER;
+      else if (strcmp(arg[iarg+1],"xyz") == 0) layoutflag = XYZ;
+      else if (strcmp(arg[iarg+1],"xzy") == 0) layoutflag = XZY;
+      else if (strcmp(arg[iarg+1],"yxz") == 0) layoutflag = YXZ;
+      else if (strcmp(arg[iarg+1],"yzx") == 0) layoutflag = YZX;
+      else if (strcmp(arg[iarg+1],"zxy") == 0) layoutflag = ZXY;
+      else if (strcmp(arg[iarg+1],"zyx") == 0) layoutflag = ZYX;
+      else error->all(FLERR,"Illegal processors command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"numa") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      numa_nodes = atoi(arg[iarg+1]);
+      if (numa_nodes < 0) error->all(FLERR,"Illegal processors command");
+      iarg += 2;
+
+    } else error->all(FLERR,"Illegal processors command");
+  }
+}
+
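
For reference, input-script lines that exercise the keywords parsed by
set_processors() above; the numeric values are illustrative, not from the patch:

processors 4 * *                      fix Px = 4, let procs2box() choose Py,Pz
processors * * * grid xyz             explicit xyz ordering, no MPI_Cart_create()
processors * * * numa 2               NUMA-aware layout, 2 numa nodes per node
processors * * 1 part 1 2 multiple    partition 1 sends to partition 2
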
+/* ----------------------------------------------------------------------
+   setup 3d grid of procs based on box size, group neighbors by numa node
+   return 1 if successful, return 0 if not
+------------------------------------------------------------------------- */
+
+int Comm::numa_set_proc_grid()
+{
+  // get names of all nodes
+
+  int name_length;
+  char node_name[MPI_MAX_PROCESSOR_NAME];
+  char node_names[MPI_MAX_PROCESSOR_NAME*nprocs];
+  MPI_Get_processor_name(node_name,&name_length);
+  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
+                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,world);
+  std::string node_string = std::string(node_name);
+
+  // get number of procs per node
+
+  std::map<std::string,int> name_map;
+  std::map<std::string,int>::iterator np;
+  for (int i = 0; i < nprocs; i++) {
+    std::string i_string = std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
+    np = name_map.find(i_string);
+    if (np == name_map.end())
+      name_map[i_string] = 1;
+    else
+      np->second++;
+  }
+  int procs_per_node = name_map.begin()->second;
+  int procs_per_numa = procs_per_node / numa_nodes;
+
+  // use regular mapping if:
+
+  if (procs_per_numa < 4 ||               // 3 or fewer procs per numa node
+      procs_per_node % numa_nodes != 0 || // procs per node not divisible by numa nodes
+      nprocs % procs_per_numa != 0 ||     // total procs not divisible by procs per numa node
+      nprocs <= procs_per_numa ||         // only 1 numa node used
+      user_procgrid[0] > 1 ||             // user specified a grid dimension
+      user_procgrid[1] > 1 ||             //   greater than 1
+      user_procgrid[2] > 1) {             //   in any dimension
+    if (me == 0) {
+      if (screen) fprintf(screen,"  1 by 1 by 1 Node grid\n");
+      if (logfile) fprintf(logfile,"  1 by 1 by 1 Node grid\n");
+    }
+    return 0;
+  }
+
+  // user settings for the factorization per numa node - currently always zero
+
+  int user_numagrid[3];
+  user_numagrid[0] = user_numagrid[1] = user_numagrid[2] = 0;
+
+  // if user specifies 1 for a proc grid dimension,
+  // also use 1 for the node grid dimension
+
+  if (user_procgrid[0] == 1) user_numagrid[0] = 1;
+  if (user_procgrid[1] == 1) user_numagrid[1] = 1;
+  if (user_procgrid[2] == 1) user_numagrid[2] = 1;
+
+  // get an initial factorization for each numa node
+  // if the user has not set the number of processors
+  // can fail (on one partition) if constrained by other_partition_style
+
+  int numagrid[3];
+  int flag = procs2box(procs_per_numa,user_numagrid,numagrid,
+                       1,1,1,other_partition_style);
+  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
+  if (numagrid[0] * numagrid[1] * numagrid[2] != procs_per_numa)
+    error->all(FLERR,"Bad node grid of processors");
+
+  // get a factorization for the grid of numa nodes
+  // should never fail
+
+  int node_count = nprocs / procs_per_numa;
+  procs2box(node_count,user_procgrid,procgrid,
+            numagrid[0],numagrid[1],numagrid[2],0);
+  if (procgrid[0] * procgrid[1] * procgrid[2] != node_count)
+    error->all(FLERR,"Bad grid of processors");
+
+  // repeat the numa node factorization using the subdomain sizes
+  // this will refine the factorization if the user specified the node layout
+  // should never fail
+
+  procs2box(procs_per_numa,user_numagrid,numagrid,
+            procgrid[0],procgrid[1],procgrid[2],0);
+  if (numagrid[0]*numagrid[1]*numagrid[2] != procs_per_numa)
+    error->all(FLERR,"Bad node grid of processors");
+  if (domain->dimension == 2 && (procgrid[2] != 1 || numagrid[2] != 1))
+    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
+
+  // assign a unique id to each node
+
+  int node_num = 0, node_id = 0;
+  for (np = name_map.begin(); np != name_map.end(); ++np) {
+    if (np->first == node_string) node_id = node_num;
+    node_num++;
+  }
+
+  // setup a per node communicator and find rank within
+
+  MPI_Comm node_comm;
+  MPI_Comm_split(world,node_id,0,&node_comm);
+  int node_rank;
+  MPI_Comm_rank(node_comm,&node_rank);
+
+  // setup a per numa communicator and find rank within
+
+  MPI_Comm numa_comm;
+  int local_numa = node_rank / procs_per_numa;
+  MPI_Comm_split(node_comm,local_numa,0,&numa_comm);
+  int numa_rank;
+  MPI_Comm_rank(numa_comm,&numa_rank);
+
+  // setup a communicator with the rank 0 procs from each numa node
+
+  MPI_Comm numa_leaders;
+  MPI_Comm_split(world,numa_rank,0,&numa_leaders);
+
+  // use the MPI Cartesian routines to map the nodes to the grid
+  // could implement layoutflag as in non-NUMA case
+
+  int reorder = 0;
+  int periods[3];
+  periods[0] = periods[1] = periods[2] = 1;
+  MPI_Comm cartesian;
+  if (numa_rank == 0) {
+    MPI_Cart_create(numa_leaders,3,procgrid,periods,reorder,&cartesian);
+    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
+  }
+
+  // broadcast numa node location in grid to other procs in numa node
+
+  MPI_Bcast(myloc,3,MPI_INT,0,numa_comm);
+
+  // get storage for the process mapping
+
+  if (grid2proc) memory->destroy(grid2proc);
+  memory->create(grid2proc,procgrid[0]*numagrid[0],procgrid[1]*numagrid[1],
+                 procgrid[2]*numagrid[2],"comm:grid2proc");
+
+  // compute my location within the grid
+
+  int z_offset = numa_rank / (numagrid[0] * numagrid[1]);
+  int y_offset = (numa_rank % (numagrid[0] * numagrid[1]))/numagrid[0];
+  int x_offset = numa_rank % numagrid[0];
+  myloc[0] = myloc[0] * numagrid[0] + x_offset;
+  myloc[1] = myloc[1] * numagrid[1] + y_offset;
+  myloc[2] = myloc[2] * numagrid[2] + z_offset;
+  procgrid[0] *= numagrid[0];
+  procgrid[1] *= numagrid[1];
+  procgrid[2] *= numagrid[2];
+
+  // allgather of locations to fill grid2proc
+
+  int **gridi;
+  memory->create(gridi,nprocs,3,"comm:gridi");
+  MPI_Allgather(&myloc,3,MPI_INT,gridi[0],3,MPI_INT,world);
+  for (int i = 0; i < nprocs; i++)
+    grid2proc[gridi[i][0]][gridi[i][1]][gridi[i][2]] = i;
+  memory->destroy(gridi);
+
+  // get my neighbors
+
+  int minus, plus;
+  for (int i = 0; i < 3; i++) {
+    numa_shift(myloc[i],procgrid[i],minus,plus);
+    procneigh[i][0] = minus;
+    procneigh[i][1] = plus;
+  }
+  procneigh[0][0] = grid2proc[procneigh[0][0]][myloc[1]][myloc[2]];
+  procneigh[0][1] = grid2proc[procneigh[0][1]][myloc[1]][myloc[2]];
+  procneigh[1][0] = grid2proc[myloc[0]][procneigh[1][0]][myloc[2]];
+  procneigh[1][1] = grid2proc[myloc[0]][procneigh[1][1]][myloc[2]];
+  procneigh[2][0] = grid2proc[myloc[0]][myloc[1]][procneigh[2][0]];
+  procneigh[2][1] = grid2proc[myloc[0]][myloc[1]][procneigh[2][1]];
+
+  if (numa_rank == 0) MPI_Comm_free(&cartesian);
+  MPI_Comm_free(&numa_leaders);
+  MPI_Comm_free(&numa_comm);
+  MPI_Comm_free(&node_comm);
+
+  // set lamda box params after procs are assigned
+
+  if (domain->triclinic) domain->set_lamda_box();
+
+  if (me == 0) {
+    if (screen) fprintf(screen,"  %d by %d by %d Node grid\n",
+                        numagrid[0],numagrid[1],numagrid[2]);
+    if (logfile) fprintf(logfile,"  %d by %d by %d Node grid\n",
+                         numagrid[0],numagrid[1],numagrid[2]);
+    if (screen) fprintf(screen,"  %d by %d by %d processor grid\n",
+                        procgrid[0],procgrid[1],procgrid[2]);
+    if (logfile) fprintf(logfile,"  %d by %d by %d processor grid\n",
+                         procgrid[0],procgrid[1],procgrid[2]);
+  }
+
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   get the indices of the neighboring processors in a dimension
+------------------------------------------------------------------------- */
+
+void Comm::numa_shift(int myloc, int num_procs, int &minus, int &plus)
+{
+  minus = myloc - 1;
+  if (minus < 0) minus = num_procs - 1;
+  plus = myloc + 1;
+  if (plus == num_procs) plus = 0;
+}
+
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
 ------------------------------------------------------------------------- */
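
To trace the three procs2box() calls above with concrete (hypothetical) numbers:
64 procs on 8-core nodes with "numa 2" gives procs_per_node = 8,
procs_per_numa = 4, and node_count = 16. For a roughly cubic box the first call
factors 4 into numagrid = 1x2x2; the second call, scaled by those subdomain
counts, factors 16 into procgrid = 4x2x2 numa domains; the third call leaves
numagrid at 1x2x2. After the elementwise multiply the final procgrid is 4x4x4,
and the 4 ranks of each numa domain own a contiguous 1x2x2 brick of subdomains.
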
diff --git a/src/comm.h b/src/comm.h
index 3dd3d9a562..e4b66931b6 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -40,7 +40,6 @@ class Comm : protected Pointers {
   virtual ~Comm();
 
   virtual void init();
-  void set_processors(int, char **);    // set procs from input script
   virtual void set_proc_grid();         // setup 3d grid of procs
   virtual void setup();                 // setup 3d comm pattern
   virtual void forward_comm(int dummy = 0);   // forward comm of atom coords
@@ -57,7 +56,9 @@ class Comm : protected Pointers {
   virtual void forward_comm_dump(class Dump *);    // forward comm from a Dump
   virtual void reverse_comm_dump(class Dump *);    // reverse comm from a Dump
 
-  virtual void set(int, char **);       // set communication style
+  virtual void set(int, char **);         // set communication style
+  void set_processors(int, char **);      // set 3d processor grid attributes
+
   virtual bigint memory_usage();
 
  protected:
@@ -82,8 +83,10 @@ class Comm : protected Pointers {
   int comm_x_only,comm_f_only;    // 1 if only exchange x,f in for/rev comm
   int map_style;                  // non-0 if global->local mapping is done
   int bordergroup;                // only communicate this group in borders
+  int layoutflag;                 // user-selected layout for 3d proc grid
+  int numa_nodes;                 // # of numa nodes per physical node, 0 = off
 
-  int other_procgrid[3];          // proc layout
+  int other_procgrid[3];          // proc layout of another partition
 
   int *firstrecv;                 // where to put 1st recv atom in each swap
   int **sendlist;                 // list of atoms to send in each swap
@@ -93,11 +96,9 @@ class Comm : protected Pointers {
   double *buf_recv;               // recv buffer for all comm
   int maxsend,maxrecv;            // current size of send/recv buffer
   int maxforward,maxreverse;      // max # of datums in forward/reverse comm
-
-  virtual void procs2box();                  // map procs to 3d box
-  virtual void cross(double, double, double,
-                     double, double, double,
-                     double &, double &, double &);   // cross product
+
+  int procs2box(int, int[3], int[3],         // map procs to 3d box
+                const int, const int, const int, int);
 
   virtual void grow_send(int,int);     // reallocate send buffer
   virtual void grow_recv(int);         // free/allocate recv buffer
   virtual void grow_list(int, int);    // reallocate one sendlist
@@ -106,6 +107,9 @@ class Comm : protected Pointers {
   virtual void allocate_multi(int);    // allocate multi arrays
   virtual void free_swap();            // free swap arrays
   virtual void free_multi();           // free multi arrays
+
+  int numa_set_proc_grid();
+  void numa_shift(int, int, int &, int &);
 };
 
 }
diff --git a/src/math_extra.h b/src/math_extra.h
index a5fdc81ceb..c161b40f15 100755
--- a/src/math_extra.h
+++ b/src/math_extra.h
@@ -233,9 +233,9 @@ double MathExtra::dot3(const double *v1, const double *v2)
 void MathExtra::cross3(const double *v1, const double *v2,
                        double *ans)
 {
-  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
-  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
-  ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
+  ans[0] = v1[1]*v2[2] - v1[2]*v2[1];
+  ans[1] = v1[2]*v2[0] - v1[0]*v2[2];
+  ans[2] = v1[0]*v2[1] - v1[1]*v2[0];
 }
 
 /* ----------------------------------------------------------------------
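
As a closing illustration, the node/numa communicator construction used by
numa_set_proc_grid() can be reproduced standalone. This is a sketch under
stated assumptions: PROCS_PER_NUMA is a hard-coded illustrative constant,
whereas the patch derives procs_per_numa from the gathered hostnames.

#include <mpi.h>
#include <cstdio>
#include <map>
#include <string>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int me,nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);

  // gather every rank's hostname, as numa_set_proc_grid() does
  int len;
  char name[MPI_MAX_PROCESSOR_NAME];
  MPI_Get_processor_name(name,&len);
  char *all = new char[(size_t) MPI_MAX_PROCESSOR_NAME*nprocs];
  MPI_Allgather(name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,
                all,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,MPI_COMM_WORLD);

  // give each distinct hostname a dense id
  // (ordered map yields the same ordering on every rank)
  std::map<std::string,int> ids;
  for (int i = 0; i < nprocs; i++)
    ids[std::string(&all[i*MPI_MAX_PROCESSOR_NAME])] = 0;
  int n = 0;
  for (std::map<std::string,int>::iterator it = ids.begin();
       it != ids.end(); ++it) it->second = n++;
  int node_id = ids[std::string(name)];

  const int PROCS_PER_NUMA = 4;   // illustrative; the patch computes this

  // split world by node, each node by numa domain, then collect
  // the rank-0 proc of every numa domain into a "leaders" communicator
  MPI_Comm node_comm,numa_comm,leaders;
  MPI_Comm_split(MPI_COMM_WORLD,node_id,0,&node_comm);
  int node_rank,numa_rank;
  MPI_Comm_rank(node_comm,&node_rank);
  MPI_Comm_split(node_comm,node_rank/PROCS_PER_NUMA,0,&numa_comm);
  MPI_Comm_rank(numa_comm,&numa_rank);
  MPI_Comm_split(MPI_COMM_WORLD,numa_rank,0,&leaders);

  printf("world rank %d: node %d, numa rank %d\n",me,node_id,numa_rank);

  MPI_Comm_free(&leaders);
  MPI_Comm_free(&numa_comm);
  MPI_Comm_free(&node_comm);
  delete [] all;
  MPI_Finalize();
  return 0;
}
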