git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7314 f3b2605a-c512-4ea7-a41b-209d697bcdaa
commit e1f70c3a98 (parent e08627bdc8)
@@ -415,8 +415,8 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
 /* copy values from data1 to data2 */
 
 int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                void *recvbuf, int *recvcounts, int *displs,
-                MPI_Datatype recvtype, int root, MPI_Comm comm)
+                void *recvbuf, int *recvcounts, int *displs,
+                MPI_Datatype recvtype, int root, MPI_Comm comm)
 {
   int n;
   if (sendtype == MPI_INT) n = sendcount*sizeof(int);
@@ -430,3 +430,24 @@ int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
   memcpy(recvbuf,sendbuf,n);
   return 0;
 }
+
+/* ---------------------------------------------------------------------- */
+
+/* copy values from data1 to data2 */
+
+int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs,
+                 MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                 MPI_Datatype recvtype, int root, MPI_Comm comm)
+{
+  int n;
+  if (sendtype == MPI_INT) n = recvcount*sizeof(int);
+  else if (sendtype == MPI_FLOAT) n = recvcount*sizeof(float);
+  else if (sendtype == MPI_DOUBLE) n = recvcount*sizeof(double);
+  else if (sendtype == MPI_CHAR) n = recvcount*sizeof(char);
+  else if (sendtype == MPI_BYTE) n = recvcount*sizeof(char);
+  else if (sendtype == MPI_LONG_LONG) n = recvcount*sizeof(uint64_t);
+  else if (sendtype == MPI_DOUBLE_INT) n = recvcount*sizeof(double_int);
+
+  memcpy(recvbuf,sendbuf,n);
+  return 0;
+}
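The Scatterv stub follows the same convention as the other one-rank collectives in this file: sendtype selects the element size and the data is byte-copied from sendbuf to recvbuf. A minimal sketch of serial code exercising it (assumes the STUBS mpi.h and library; not part of the commit):

#include "mpi.h"
#include <stdio.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int src[4] = {1,2,3,4};
  int dst[4] = {0,0,0,0};
  int counts[1] = {4};        /* only one rank exists in the stub library */
  int displs[1] = {0};
  MPI_Scatterv(src,counts,displs,MPI_INT,dst,4,MPI_INT,0,MPI_COMM_WORLD);
  printf("%d %d %d %d\n",dst[0],dst[1],dst[2],dst[3]);   /* prints 1 2 3 4 */
  MPI_Finalize();
  return 0;
}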
@@ -115,7 +115,10 @@ int MPI_Gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                void *recvbuf, int recvcount, MPI_Datatype recvtype,
                int root, MPI_Comm comm);
 int MPI_Gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                void *recvbuf, int *recvcounts, int *displs,
-                MPI_Datatype recvtype, int root, MPI_Comm comm);
+                void *recvbuf, int *recvcounts, int *displs,
+                MPI_Datatype recvtype, int root, MPI_Comm comm);
+int MPI_Scatterv(void *sendbuf, int *sendcounts, int *displs,
+                 MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                 MPI_Datatype recvtype, int root, MPI_Comm comm);
 
 #endif
src/comm.cpp (593 changes)
@@ -35,9 +35,13 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
+#include "math_extra.h"
 #include "error.h"
 #include "memory.h"
 
+#include <map>
+#include <string>
+
 #ifdef _OPENMP
 #include "omp.h"
 #endif
@@ -51,6 +55,7 @@ using namespace LAMMPS_NS;
 
 enum{SINGLE,MULTI};
 enum{NONE,MULTIPLE};
+enum{CART,CARTREORDER,XYZ,XZY,YXZ,YZX,ZXY,ZYX};
 
 /* ----------------------------------------------------------------------
    setup MPI and allocate buffer space
@@ -63,6 +68,8 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
 
   user_procgrid[0] = user_procgrid[1] = user_procgrid[2] = 0;
   grid2proc = NULL;
+  layoutflag = CART;
+  numa_nodes = 0;
 
   bordergroup = 0;
   style = SINGLE;
@@ -131,61 +138,7 @@ Comm::~Comm()
 }
 
-/* ----------------------------------------------------------------------
-   set dimensions of processor grid
-   invoked from input script by processors command
-------------------------------------------------------------------------- */
-
-void Comm::set_processors(int narg, char **arg)
-{
-  if (narg < 3) error->all(FLERR,"Illegal processors command");
-
-  if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0;
-  else user_procgrid[0] = atoi(arg[0]);
-  if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0;
-  else user_procgrid[1] = atoi(arg[1]);
-  if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0;
-  else user_procgrid[2] = atoi(arg[2]);
-
-  if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0)
-    error->all(FLERR,"Illegal processors command");
-
-  int iarg = 3;
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"part") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
-      if (universe->nworlds == 1)
-        error->all(FLERR,
-                   "Cannot use processors part command "
-                   "without using partitions");
-      int isend = atoi(arg[iarg+1]);
-      int irecv = atoi(arg[iarg+2]);
-      if (isend < 1 || isend > universe->nworlds ||
-          irecv < 1 || irecv > universe->nworlds || isend == irecv)
-        error->all(FLERR,"Invalid partitions in processors part command");
-      if (isend-1 == universe->iworld) {
-        if (send_to_partition >= 0)
-          error->all(FLERR,
-                     "Sending partition in processors part command "
-                     "is already a sender");
-        send_to_partition = irecv-1;
-      }
-      if (irecv-1 == universe->iworld) {
-        if (recv_from_partition >= 0)
-          error->all(FLERR,
-                     "Receiving partition in processors part command "
-                     "is already a receiver");
-        recv_from_partition = isend-1;
-      }
-      if (strcmp(arg[iarg+3],"multiple") == 0) {
-        if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
-      } else error->all(FLERR,"Illegal processors command");
-      iarg += 4;
-    } else error->all(FLERR,"Illegal processors command");
-  }
-}
-
 /* ----------------------------------------------------------------------
-   setup 3d grid of procs based on box size
+   create 3d grid of procs based on box size
 ------------------------------------------------------------------------- */
 
 void Comm::set_proc_grid()
@@ -200,9 +153,18 @@ void Comm::set_proc_grid()
     MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
   }
 
   // create layout of procs mapped to simulation box
-
-  procs2box();
+  // use NUMA routines if numa_nodes > 0
+  // if NUMA routines fail, just continue
+
+  if (numa_nodes)
+    if (numa_set_proc_grid()) return;
+
+  // create layout of procs mapped to simulation box
+  // can fail (on one partition) if constrained by other_partition_style
+
+  int flag =
+    procs2box(nprocs,user_procgrid,procgrid,1,1,1,other_partition_style);
+  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
 
   // error check
@@ -211,35 +173,89 @@ void Comm::set_proc_grid()
   if (domain->dimension == 2 && procgrid[2] != 1)
     error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
 
+  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
+
   if (grid2proc) memory->destroy(grid2proc);
   memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
                  "comm:grid2proc");
 
-  // use MPI Cartesian routines to setup 3d grid of procs
-  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
-  // let MPI compute it instead of LAMMPS in case it is machine optimized
+  // use MPI Cartesian routines to layout 3d grid of procs
+  // MPI may do layout in machine-optimized fashion
 
-  int reorder = 0;
-  int periods[3];
-  periods[0] = periods[1] = periods[2] = 1;
-  MPI_Comm cartesian;
+  if (layoutflag == CART || layoutflag == CARTREORDER) {
+    int reorder = 0;
+    if (layoutflag == CARTREORDER) reorder = 1;
+    int periods[3];
+    periods[0] = periods[1] = periods[2] = 1;
+    MPI_Comm cartesian;
 
-  MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
-  MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
-  MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
-  MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
-  MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
+    MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
+    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
+    MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
+    MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
+    MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
 
-  int coords[3];
-  int i,j,k;
-  for (i = 0; i < procgrid[0]; i++)
-    for (j = 0; j < procgrid[1]; j++)
-      for (k = 0; k < procgrid[2]; k++) {
-        coords[0] = i; coords[1] = j; coords[2] = k;
-        MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
-      }
+    int coords[3];
+    int i,j,k;
+    for (i = 0; i < procgrid[0]; i++)
+      for (j = 0; j < procgrid[1]; j++)
+        for (k = 0; k < procgrid[2]; k++) {
+          coords[0] = i; coords[1] = j; coords[2] = k;
+          MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
+        }
 
-  MPI_Comm_free(&cartesian);
+    MPI_Comm_free(&cartesian);
+
+  // layout 3d grid of procs explicitly, via user-requested XYZ ordering
+
+  } else {
+    int i,j,k,ime,jme,kme;
+    for (i = 0; i < procgrid[0]; i++)
+      for (j = 0; j < procgrid[1]; j++)
+        for (k = 0; k < procgrid[2]; k++) {
+          if (layoutflag == XYZ)
+            grid2proc[i][j][k] = k*procgrid[1]*procgrid[0] + j*procgrid[0] + i;
+          else if (layoutflag == XZY)
+            grid2proc[i][j][k] = j*procgrid[2]*procgrid[0] + k*procgrid[0] + i;
+          else if (layoutflag == YXZ)
+            grid2proc[i][j][k] = k*procgrid[0]*procgrid[1] + i*procgrid[1] + j;
+          else if (layoutflag == YZX)
+            grid2proc[i][j][k] = i*procgrid[2]*procgrid[1] + k*procgrid[1] + j;
+          else if (layoutflag == ZXY)
+            grid2proc[i][j][k] = j*procgrid[0]*procgrid[2] + i*procgrid[2] + k;
+          else if (layoutflag == ZYX)
+            grid2proc[i][j][k] = i*procgrid[1]*procgrid[2] + j*procgrid[2] + k;
+
+          if (grid2proc[i][j][k] == me) {
+            ime = i; jme = j; kme = k;
+          }
+        }
+
+    myloc[0] = ime;
+    myloc[1] = jme;
+    myloc[2] = kme;
+
+    i = ime-1;
+    if (i < 0) i = procgrid[0]-1;
+    procneigh[0][0] = grid2proc[i][jme][kme];
+    i = ime+1;
+    if (i == procgrid[0]) i = 0;
+    procneigh[0][1] = grid2proc[i][jme][kme];
+
+    j = jme-1;
+    if (j < 0) j = procgrid[1]-1;
+    procneigh[1][0] = grid2proc[ime][j][kme];
+    j = jme+1;
+    if (j == procgrid[1]) j = 0;
+    procneigh[1][1] = grid2proc[ime][j][kme];
+
+    k = kme-1;
+    if (k < 0) k = procgrid[2]-1;
+    procneigh[2][0] = grid2proc[ime][jme][k];
+    k = kme+1;
+    if (k == procgrid[2]) k = 0;
+    procneigh[2][1] = grid2proc[ime][jme][k];
+  }
 
   // set lamda box params after procs are assigned
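Each of the six explicit orderings above makes the first-named dimension vary fastest with rank; e.g. XYZ is rank = k*Py*Px + j*Px + i. A standalone sketch (not LAMMPS code) verifying that the XYZ formula is invertible, i.e. every rank maps to exactly one (i,j,k):

#include <cstdio>

int main()
{
  const int Px = 2, Py = 3, Pz = 4;
  for (int k = 0; k < Pz; k++)
    for (int j = 0; j < Py; j++)
      for (int i = 0; i < Px; i++) {
        int rank = k*Py*Px + j*Px + i;      // XYZ formula from the diff
        int ii = rank % Px;                 // invert: x varies fastest
        int jj = (rank / Px) % Py;
        int kk = rank / (Px*Py);
        if (ii != i || jj != j || kk != k) printf("mismatch at %d\n",rank);
      }
  printf("XYZ ordering is a bijection on %d ranks\n",Px*Py*Pz);
  return 0;
}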
@@ -1179,32 +1195,38 @@ void Comm::reverse_comm_dump(Dump *dump)
 }
 
 /* ----------------------------------------------------------------------
-   assign nprocs to 3d xprd,yprd,zprd box so as to minimize surface area
-   area = surface area of each of 3 faces of simulation box
+   assign numprocs to 3d box so as to minimize surface area
+   area = surface area of each of 3 faces of simulation box divided by sx,sy,sz
    for triclinic, area = cross product of 2 edge vectors stored in h matrix
+   valid assignment will be factorization of numprocs = Px by Py by Pz
+   user_factors = if non-zero, factor is specified by user
+   sx,sy,sz = scale box xyz dimension by dividing by sx,sy,sz
+   return factors = # of procs assigned to each dimension
+   return 1 if successfully factored, 0 if not
 ------------------------------------------------------------------------- */
 
-void Comm::procs2box()
+int Comm::procs2box(int numprocs, int user_factors[3], int factors[3],
+                    const int sx, const int sy, const int sz, int other)
 {
-  procgrid[0] = user_procgrid[0];
-  procgrid[1] = user_procgrid[1];
-  procgrid[2] = user_procgrid[2];
+  factors[0] = user_factors[0];
+  factors[1] = user_factors[1];
+  factors[2] = user_factors[2];
 
   // all 3 proc counts are specified
 
-  if (procgrid[0] && procgrid[1] && procgrid[2]) return;
+  if (factors[0] && factors[1] && factors[2]) return 1;
 
   // 2 out of 3 proc counts are specified
 
-  if (procgrid[0] > 0 && procgrid[1] > 0) {
-    procgrid[2] = nprocs/(procgrid[0]*procgrid[1]);
-    return;
-  } else if (procgrid[0] > 0 && procgrid[2] > 0) {
-    procgrid[1] = nprocs/(procgrid[0]*procgrid[2]);
-    return;
-  } else if (procgrid[1] > 0 && procgrid[2] > 0) {
-    procgrid[0] = nprocs/(procgrid[1]*procgrid[2]);
-    return;
+  if (factors[0] > 0 && factors[1] > 0) {
+    factors[2] = numprocs/(factors[0]*factors[1]);
+    return 1;
+  } else if (factors[0] > 0 && factors[2] > 0) {
+    factors[1] = numprocs/(factors[0]*factors[2]);
+    return 1;
+  } else if (factors[1] > 0 && factors[2] > 0) {
+    factors[0] = numprocs/(factors[1]*factors[2]);
+    return 1;
   }
 
   // determine cross-sectional areas for orthogonal and triclinic boxes
@@ -1212,68 +1234,66 @@ void Comm::procs2box()
 
   double area[3];
   if (domain->triclinic == 0) {
-    area[0] = domain->xprd * domain->yprd;
-    area[1] = domain->xprd * domain->zprd;
-    area[2] = domain->yprd * domain->zprd;
+    area[0] = domain->xprd * domain->yprd / (sx * sy);
+    area[1] = domain->xprd * domain->zprd / (sx * sz);
+    area[2] = domain->yprd * domain->zprd / (sy * sz);
   } else {
     double *h = domain->h;
-    double x,y,z;
-    cross(h[0],0.0,0.0,h[5],h[1],0.0,x,y,z);
-    area[0] = sqrt(x*x + y*y + z*z);
-    cross(h[0],0.0,0.0,h[4],h[3],h[2],x,y,z);
-    area[1] = sqrt(x*x + y*y + z*z);
-    cross(h[5],h[1],0.0,h[4],h[3],h[2],x,y,z);
-    area[2] = sqrt(x*x + y*y + z*z);
+    double a[3],b[3],c[3];
+    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
+    b[0] = h[5]; b[1] = h[1]; b[2] = 0.0;
+    MathExtra::cross3(a,b,c);
+    area[0] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sy);
+    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
+    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
+    MathExtra::cross3(a,b,c);
+    area[1] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sz);
+    a[0] = h[5]; a[1] = h[1]; a[2] = 0.0;
+    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
+    MathExtra::cross3(a,b,c);
+    area[2] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sy * sz);
   }
 
   double bestsurf = 2.0 * (area[0]+area[1]+area[2]);
 
-  // loop thru all possible factorizations of nprocs
-  // choose factorization with least surface area
-  // surf = surface area of a proc sub-domain
-  // only consider valid cases that match procgrid settings
-  // if set, only consider factorizations that match other_procgrid settings
-
-  procgrid[0] = procgrid[1] = procgrid[2] = 0;
+  // loop thru all possible factorizations of numprocs
+  // choose factorization with least surface area
+  // surf = surface area of a proc sub-domain
+  // only consider cases that match user_factors & other_procgrid settings
+  // success = 1 if valid factorization is found
+  // may not be if other constraint is enforced
 
   int ipx,ipy,ipz,valid;
   double surf;
 
+  int success = 0;
   ipx = 1;
-  while (ipx <= nprocs) {
+  while (ipx <= numprocs) {
     valid = 1;
-    if (user_procgrid[0] && ipx != user_procgrid[0]) valid = 0;
-    if (other_partition_style) {
-      if (other_partition_style == MULTIPLE && other_procgrid[0] % ipx)
-        valid = 0;
-    }
-    if (nprocs % ipx) valid = 0;
+    if (user_factors[0] && ipx != user_factors[0]) valid = 0;
+    if (other == MULTIPLE && other_procgrid[0] % ipx) valid = 0;
+    if (numprocs % ipx) valid = 0;
 
     if (!valid) {
       ipx++;
      continue;
    }
 
    ipy = 1;
-    while (ipy <= nprocs/ipx) {
+    while (ipy <= numprocs/ipx) {
      valid = 1;
-      if (user_procgrid[1] && ipy != user_procgrid[1]) valid = 0;
-      if (other_partition_style) {
-        if (other_partition_style == MULTIPLE && other_procgrid[1] % ipy)
-          valid = 0;
-      }
-      if ((nprocs/ipx) % ipy) valid = 0;
+      if (user_factors[1] && ipy != user_factors[1]) valid = 0;
+      if (other == MULTIPLE && other_procgrid[1] % ipy) valid = 0;
+      if ((numprocs/ipx) % ipy) valid = 0;
      if (!valid) {
        ipy++;
        continue;
      }
 
-      ipz = nprocs/ipx/ipy;
+      ipz = numprocs/ipx/ipy;
      valid = 1;
-      if (user_procgrid[2] && ipz != user_procgrid[2]) valid = 0;
-      if (other_partition_style) {
-        if (other_partition_style == MULTIPLE && other_procgrid[2] % ipz)
-          valid = 0;
-      }
+      if (user_factors[2] && ipz != user_factors[2]) valid = 0;
+      if (other == MULTIPLE && other_procgrid[2] % ipz) valid = 0;
      if (domain->dimension == 2 && ipz != 1) valid = 0;
      if (!valid) {
        ipy++;
@@ -1282,10 +1302,11 @@ void Comm::procs2box()
 
       surf = area[0]/ipx/ipy + area[1]/ipx/ipz + area[2]/ipy/ipz;
       if (surf < bestsurf) {
+        success = 1;
         bestsurf = surf;
-        procgrid[0] = ipx;
-        procgrid[1] = ipy;
-        procgrid[2] = ipz;
+        factors[0] = ipx;
+        factors[1] = ipy;
+        factors[2] = ipz;
       }
       ipy++;
     }
@@ -1293,21 +1314,7 @@ void Comm::procs2box()
     ipx++;
   }
 
-  if (procgrid[0] == 0)
-    error->all(FLERR,"Could not layout grid of processors",1);
-}
-
-/* ----------------------------------------------------------------------
-   vector cross product: c = a x b
-------------------------------------------------------------------------- */
-
-void Comm::cross(double ax, double ay, double az,
-                 double bx, double by, double bz,
-                 double &cx, double &cy, double &cz)
-{
-  cx = ay*bz - az*by;
-  cy = az*bx - ax*bz;
-  cz = ax*by - ay*bx;
+  return success;
 }
 
 /* ----------------------------------------------------------------------
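With the new signature, procs2box reports failure via its return value and the caller decides whether that is fatal; the search itself is still a brute-force enumeration of factorizations numprocs = ipx*ipy*ipz that minimizes per-process surface area. A standalone sketch of the core loop, assuming an orthogonal unit box and no user, dimension, or partition constraints:

#include <cstdio>

int main()
{
  const int P = 24;
  const double ax = 1.0, ay = 1.0, az = 1.0;   // xy, xz, yz face areas
  double best = 2.0*(ax + ay + az);
  int bx = 0, by = 0, bz = 0;
  for (int px = 1; px <= P; px++) {
    if (P % px) continue;
    for (int py = 1; py <= P/px; py++) {
      if ((P/px) % py) continue;
      int pz = P/px/py;                        // exact factorization of P
      double surf = ax/px/py + ay/px/pz + az/py/pz;
      if (surf < best) { best = surf; bx = px; by = py; bz = pz; }
    }
  }
  printf("%d procs -> %d by %d by %d grid\n",P,bx,by,bz);   // 2 by 3 by 4
  return 0;
}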
@@ -1433,6 +1440,7 @@ void Comm::free_multi()
 
 /* ----------------------------------------------------------------------
    set communication style
+   invoked from input script by communicate command
 ------------------------------------------------------------------------- */
 
 void Comm::set(int narg, char **arg)
@@ -1470,6 +1478,293 @@ void Comm::set(int narg, char **arg)
   }
 }
 
+/* ----------------------------------------------------------------------
+   set dimensions for 3d grid of processors, and associated flags
+   invoked from input script by processors command
+------------------------------------------------------------------------- */
+
+void Comm::set_processors(int narg, char **arg)
+{
+  if (narg < 3) error->all(FLERR,"Illegal processors command");
+
+  if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0;
+  else user_procgrid[0] = atoi(arg[0]);
+  if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0;
+  else user_procgrid[1] = atoi(arg[1]);
+  if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0;
+  else user_procgrid[2] = atoi(arg[2]);
+
+  if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0)
+    error->all(FLERR,"Illegal processors command");
+
+  int iarg = 3;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"part") == 0) {
+      if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
+      if (universe->nworlds == 1)
+        error->all(FLERR,
+                   "Cannot use processors part command "
+                   "without using partitions");
+      int isend = atoi(arg[iarg+1]);
+      int irecv = atoi(arg[iarg+2]);
+      if (isend < 1 || isend > universe->nworlds ||
+          irecv < 1 || irecv > universe->nworlds || isend == irecv)
+        error->all(FLERR,"Invalid partitions in processors part command");
+      if (isend-1 == universe->iworld) {
+        if (send_to_partition >= 0)
+          error->all(FLERR,
+                     "Sending partition in processors part command "
+                     "is already a sender");
+        send_to_partition = irecv-1;
+      }
+      if (irecv-1 == universe->iworld) {
+        if (recv_from_partition >= 0)
+          error->all(FLERR,
+                     "Receiving partition in processors part command "
+                     "is already a receiver");
+        recv_from_partition = isend-1;
+      }
+      if (strcmp(arg[iarg+3],"multiple") == 0) {
+        if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
+      } else error->all(FLERR,"Illegal processors command");
+      iarg += 4;
+
+    } else if (strcmp(arg[iarg],"grid") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      if (strcmp(arg[iarg+1],"cart") == 0) layoutflag = CART;
+      else if (strcmp(arg[iarg+1],"cart/reorder") == 0)
+        layoutflag = CARTREORDER;
+      else if (strcmp(arg[iarg+1],"xyz") == 0) layoutflag = XYZ;
+      else if (strcmp(arg[iarg+1],"xzy") == 0) layoutflag = XZY;
+      else if (strcmp(arg[iarg+1],"yxz") == 0) layoutflag = YXZ;
+      else if (strcmp(arg[iarg+1],"yzx") == 0) layoutflag = YZX;
+      else if (strcmp(arg[iarg+1],"zxy") == 0) layoutflag = ZXY;
+      else if (strcmp(arg[iarg+1],"zyx") == 0) layoutflag = ZYX;
+      else error->all(FLERR,"Illegal processors command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"numa") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      numa_nodes = atoi(arg[iarg+1]);
+      if (numa_nodes < 0) error->all(FLERR,"Illegal processors command");
+      iarg += 2;
+
+    } else error->all(FLERR,"Illegal processors command");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup 3d grid of procs based on box size, group neighbors by numa node
+   return 1 if successful, return 0 if not
+------------------------------------------------------------------------- */
+
+int Comm::numa_set_proc_grid()
+{
+  // get names of all nodes
+
+  int name_length;
+  char node_name[MPI_MAX_PROCESSOR_NAME];
+  char node_names[MPI_MAX_PROCESSOR_NAME*nprocs];
+  MPI_Get_processor_name(node_name,&name_length);
+  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
+                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,world);
+  std::string node_string = std::string(node_name);
+
+  // get number of procs per node
+
+  std::map<std::string,int> name_map;
+  std::map<std::string,int>::iterator np;
+  for (int i = 0; i < nprocs; i++) {
+    std::string i_string = std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
+    np = name_map.find(i_string);
+    if (np == name_map.end()) name_map[i_string] = 1;
+    else np->second++;
+  }
+  int procs_per_node = name_map.begin()->second;
+  int procs_per_numa = procs_per_node / numa_nodes;
+
+  // use regular mapping if:
+
+  if (procs_per_numa < 4 ||                // 3 or less procs per numa node
+      procs_per_node % numa_nodes != 0 ||  // different # of procs per numa node
+      nprocs % procs_per_numa != 0 ||      // procs not a multiple of numa procs
+      nprocs <= procs_per_numa ||          // only 1 numa node used
+      user_procgrid[0] > 1 ||              // user specified a grid dimension
+      user_procgrid[1] > 1 ||              // that is greater than 1
+      user_procgrid[2] > 1) {              // in any dimension
+    if (me == 0) {
+      if (screen) fprintf(screen,"  1 by 1 by 1 Node grid\n");
+      if (logfile) fprintf(logfile,"  1 by 1 by 1 Node grid\n");
+    }
+    return 0;
+  }
+
+  // user settings for the factorization per numa node - currently always zero
+
+  int user_numagrid[3];
+  user_numagrid[0] = user_numagrid[1] = user_numagrid[2] = 0;
+
+  // if user specifies 1 for a proc grid dimension,
+  // also use 1 for the node grid dimension
+
+  if (user_procgrid[0] == 1) user_numagrid[0] = 1;
+  if (user_procgrid[1] == 1) user_numagrid[1] = 1;
+  if (user_procgrid[2] == 1) user_numagrid[2] = 1;
+
+  // get an initial factorization for each numa node,
+  // if the user has not set the number of processors
+  // can fail (on one partition) if constrained by other_partition_style
+
+  int numagrid[3];
+  int flag = procs2box(procs_per_numa,user_numagrid,numagrid,
+                       1,1,1,other_partition_style);
+  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
+  if (numagrid[0]*numagrid[1]*numagrid[2] != procs_per_numa)
+    error->all(FLERR,"Bad node grid of processors");
+
+  // get a factorization for the grid of numa nodes
+  // should never fail
+
+  int node_count = nprocs / procs_per_numa;
+  procs2box(node_count,user_procgrid,procgrid,
+            numagrid[0],numagrid[1],numagrid[2],0);
+  if (procgrid[0]*procgrid[1]*procgrid[2] != node_count)
+    error->all(FLERR,"Bad grid of processors");
+
+  // repeat the numa node factorization using the subdomain sizes
+  // this will refine the factorization if the user specified the node layout
+  // should never fail
+
+  procs2box(procs_per_numa,user_numagrid,numagrid,
+            procgrid[0],procgrid[1],procgrid[2],0);
+  if (numagrid[0]*numagrid[1]*numagrid[2] != procs_per_numa)
+    error->all(FLERR,"Bad node grid of processors");
+  if (domain->dimension == 2 && (procgrid[2] != 1 || numagrid[2] != 1))
+    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
+
+  // assign a unique id to each node
+
+  int node_num = 0, node_id = 0;
+  for (np = name_map.begin(); np != name_map.end(); ++np) {
+    if (np->first == node_string) node_id = node_num;
+    node_num++;
+  }
+
+  // setup a per node communicator and find rank within
+
+  MPI_Comm node_comm;
+  MPI_Comm_split(world,node_id,0,&node_comm);
+  int node_rank;
+  MPI_Comm_rank(node_comm,&node_rank);
+
+  // setup a per numa communicator and find rank within
+
+  MPI_Comm numa_comm;
+  int local_numa = node_rank / procs_per_numa;
+  MPI_Comm_split(node_comm,local_numa,0,&numa_comm);
+  int numa_rank;
+  MPI_Comm_rank(numa_comm,&numa_rank);
+
+  // setup a communicator with the rank 0 procs from each numa node
+
+  MPI_Comm numa_leaders;
+  MPI_Comm_split(world,numa_rank,0,&numa_leaders);
+
+  // use the MPI Cartesian routines to map the nodes to the grid
+  // could implement layoutflag as in non-NUMA case
+
+  int reorder = 0;
+  int periods[3];
+  periods[0] = periods[1] = periods[2] = 1;
+  MPI_Comm cartesian;
+  if (numa_rank == 0) {
+    MPI_Cart_create(numa_leaders,3,procgrid,periods,reorder,&cartesian);
+    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
+  }
+
+  // broadcast numa node location in grid to other procs in numa node
+
+  MPI_Bcast(myloc,3,MPI_INT,0,numa_comm);
+
+  // get storage for the process mapping
+
+  if (grid2proc) memory->destroy(grid2proc);
+  memory->create(grid2proc,procgrid[0]*numagrid[0],procgrid[1]*numagrid[1],
+                 procgrid[2]*numagrid[2],"comm:grid2proc");
+
+  // compute my location within the grid
+
+  int z_offset = numa_rank / (numagrid[0] * numagrid[1]);
+  int y_offset = (numa_rank % (numagrid[0] * numagrid[1])) / numagrid[0];
+  int x_offset = numa_rank % numagrid[0];
+  myloc[0] = myloc[0] * numagrid[0] + x_offset;
+  myloc[1] = myloc[1] * numagrid[1] + y_offset;
+  myloc[2] = myloc[2] * numagrid[2] + z_offset;
+  procgrid[0] *= numagrid[0];
+  procgrid[1] *= numagrid[1];
+  procgrid[2] *= numagrid[2];
+
+  // allgather of locations to fill grid2proc
+
+  int **gridi;
+  memory->create(gridi,nprocs,3,"comm:gridi");
+  MPI_Allgather(&myloc,3,MPI_INT,gridi[0],3,MPI_INT,world);
+  for (int i = 0; i < nprocs; i++)
+    grid2proc[gridi[i][0]][gridi[i][1]][gridi[i][2]] = i;
+  memory->destroy(gridi);
+
+  // get my neighbors
+
+  int minus,plus;
+  for (int i = 0; i < 3; i++) {
+    numa_shift(myloc[i],procgrid[i],minus,plus);
+    procneigh[i][0] = minus;
+    procneigh[i][1] = plus;
+  }
+  procneigh[0][0] = grid2proc[procneigh[0][0]][myloc[1]][myloc[2]];
+  procneigh[0][1] = grid2proc[procneigh[0][1]][myloc[1]][myloc[2]];
+  procneigh[1][0] = grid2proc[myloc[0]][procneigh[1][0]][myloc[2]];
+  procneigh[1][1] = grid2proc[myloc[0]][procneigh[1][1]][myloc[2]];
+  procneigh[2][0] = grid2proc[myloc[0]][myloc[1]][procneigh[2][0]];
+  procneigh[2][1] = grid2proc[myloc[0]][myloc[1]][procneigh[2][1]];
+
+  if (numa_rank == 0) MPI_Comm_free(&cartesian);
+  MPI_Comm_free(&numa_leaders);
+  MPI_Comm_free(&numa_comm);
+  MPI_Comm_free(&node_comm);
+
+  // set lamda box params after procs are assigned
+
+  if (domain->triclinic) domain->set_lamda_box();
+
+  if (me == 0) {
+    if (screen) fprintf(screen,"  %d by %d by %d Node grid\n",
+                        numagrid[0],numagrid[1],numagrid[2]);
+    if (logfile) fprintf(logfile,"  %d by %d by %d Node grid\n",
+                         numagrid[0],numagrid[1],numagrid[2]);
+    if (screen) fprintf(screen,"  %d by %d by %d processor grid\n",
+                        procgrid[0],procgrid[1],procgrid[2]);
+    if (logfile) fprintf(logfile,"  %d by %d by %d processor grid\n",
+                         procgrid[0],procgrid[1],procgrid[2]);
+  }
+
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   get the index to the neighboring processors in a dimension
+------------------------------------------------------------------------- */
+
+void Comm::numa_shift(int myloc, int num_procs, int &minus, int &plus)
+{
+  minus = myloc - 1;
+  if (minus < 0) minus = num_procs - 1;
+  plus = myloc + 1;
+  if (plus == num_procs) plus = 0;
+}
+
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
 ------------------------------------------------------------------------- */
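numa_set_proc_grid builds its hierarchy purely from MPI_Comm_split: one communicator per node (color = node id), one per NUMA domain within the node, and a leaders communicator formed from the rank-0 proc of each NUMA domain, which is then laid out with MPI_Cart_create. A toy sketch of that splitting pattern (not LAMMPS code), faking node membership as 4 consecutive ranks per node; run with any multiple of 4 ranks:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  // color by "node": here simply rank/4, i.e. 4 procs per fake node
  MPI_Comm node_comm;
  MPI_Comm_split(MPI_COMM_WORLD,rank/4,0,&node_comm);
  int node_rank;
  MPI_Comm_rank(node_comm,&node_rank);

  // procs sharing a node_rank form one communicator;
  // the node_rank 0 group plays the role of numa_leaders
  MPI_Comm leaders;
  MPI_Comm_split(MPI_COMM_WORLD,node_rank,0,&leaders);

  if (node_rank == 0) printf("rank %d is a node leader\n",rank);
  MPI_Comm_free(&leaders);
  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}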
src/comm.h (20 changes)
@@ -40,7 +40,6 @@ class Comm : protected Pointers {
   virtual ~Comm();
 
   virtual void init();
-  void set_processors(int, char **);        // set procs from input script
   virtual void set_proc_grid();             // setup 3d grid of procs
   virtual void setup();                     // setup 3d comm pattern
   virtual void forward_comm(int dummy = 0); // forward comm of atom coords
@@ -57,7 +56,9 @@ class Comm : protected Pointers {
   virtual void forward_comm_dump(class Dump *);   // forward comm from a Dump
   virtual void reverse_comm_dump(class Dump *);   // reverse comm from a Dump
 
-  virtual void set(int, char **);           // set communication style
+  virtual void set(int, char **);           // set communication style
+  void set_processors(int, char **);        // set 3d processor grid attributes
 
   virtual bigint memory_usage();
 
 protected:
@@ -82,8 +83,10 @@ class Comm : protected Pointers {
   int comm_x_only,comm_f_only;      // 1 if only exchange x,f in for/rev comm
   int map_style;                    // non-0 if global->local mapping is done
   int bordergroup;                  // only communicate this group in borders
+  int layoutflag;                   // user-selected layout for 3d proc grid
+  int numa_nodes;                   // layout NUMA-aware 3d proc grid
 
-  int other_procgrid[3];            // proc layout
+  int other_procgrid[3];            // proc layout of another partition
 
   int *firstrecv;                   // where to put 1st recv atom in each swap
   int **sendlist;                   // list of atoms to send in each swap
@@ -93,11 +96,9 @@ class Comm : protected Pointers {
   double *buf_recv;                 // recv buffer for all comm
   int maxsend,maxrecv;              // current size of send/recv buffer
   int maxforward,maxreverse;        // max # of datums in forward/reverse comm
 
-  virtual void procs2box();         // map procs to 3d box
-  virtual void cross(double, double, double,
-                     double, double, double,
-                     double &, double &, double &);    // cross product
+  int procs2box(int, int[3], int[3],                   // map procs to 3d box
+                const int, const int, const int, int);
   virtual void grow_send(int,int);            // reallocate send buffer
   virtual void grow_recv(int);                // free/allocate recv buffer
   virtual void grow_list(int, int);           // reallocate one sendlist
@@ -106,6 +107,9 @@ class Comm : protected Pointers {
   virtual void allocate_multi(int);           // allocate multi arrays
   virtual void free_swap();                   // free swap arrays
   virtual void free_multi();                  // free multi arrays
+
+  int numa_set_proc_grid();
+  void numa_shift(int, int, int &, int &);
 };
 
 }
@@ -233,9 +233,9 @@ double MathExtra::dot3(const double *v1, const double *v2)
 
 void MathExtra::cross3(const double *v1, const double *v2, double *ans)
 {
-  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
-  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
-  ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
+  ans[0] = v1[1]*v2[2] - v1[2]*v2[1];
+  ans[1] = v1[2]*v2[0] - v1[0]*v2[2];
+  ans[2] = v1[0]*v2[1] - v1[1]*v2[0];
 }
 
 /* ----------------------------------------------------------------------
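A quick check of the cross3 convention that procs2box now relies on for triclinic areas: for right-handed unit vectors, x cross y = z. The sketch assumes the LAMMPS src directory is on the include path (cross3 is inline in math_extra.h):

#include <cstdio>
#include "math_extra.h"

int main()
{
  double x[3] = {1.0,0.0,0.0};
  double y[3] = {0.0,1.0,0.0};
  double z[3];
  MathExtra::cross3(x,y,z);              // inline function from math_extra.h
  printf("%g %g %g\n",z[0],z[1],z[2]);   // expect: 0 0 1
  return 0;
}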