From f871ecdc67dbcdc10f0ffa07c407a965e2465a19 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Fri, 10 Mar 2017 15:55:07 -0700 Subject: [PATCH] change to RCB cuts in load-balancing commands, also a new option for fix halt --- doc/src/Manual.txt | 4 +- doc/src/balance.txt | 38 +-- doc/src/change_box.txt | 4 +- doc/src/create_atoms.txt | 11 + doc/src/fix_halt.txt | 54 +++- doc/src/quit.txt | 2 +- doc/src/thermo_style.txt | 2 +- src/balance.cpp | 23 +- src/balance.h | 1 + src/fix_halt.cpp | 57 ++++- src/fix_halt.h | 6 +- src/rcb.cpp | 523 ++++++++++++++++++++++++++++++++++++++- src/rcb.h | 2 + src/thermo.cpp | 8 +- src/version.h | 2 +- 15 files changed, 686 insertions(+), 51 deletions(-) diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt index 375a1c097b..bb7a1452bc 100644 --- a/doc/src/Manual.txt +++ b/doc/src/Manual.txt @@ -1,7 +1,7 @@ LAMMPS Users Manual - + @@ -21,7 +21,7 @@

LAMMPS Documentation :c,h3 -7 Mar 2017 version :c,h4 +10 Mar 2017 version :c,h4 Version info: :h4 diff --git a/doc/src/balance.txt b/doc/src/balance.txt index 9aeb03392f..79728d6569 100644 --- a/doc/src/balance.txt +++ b/doc/src/balance.txt @@ -286,24 +286,32 @@ above. It performs a recursive coordinate bisectioning (RCB) of the simulation domain. The basic idea is as follows. The simulation domain is cut into 2 boxes by an axis-aligned cut in -the longest dimension, leaving one new box on either side of the cut. -All the processors are also partitioned into 2 groups, half assigned -to the box on the lower side of the cut, and half to the box on the -upper side. (If the processor count is odd, one side gets an extra -processor.) The cut is positioned so that the number of particles in -the lower box is exactly the number that the processors assigned to -that box should own for load balance to be perfect. This also makes -load balance for the upper box perfect. The positioning is done -iteratively, by a bisectioning method. Note that counting particles -on either side of the cut requires communication between all -processors at each iteration. +one of the dimensions, leaving one new sub-box on either side of the +cut. Which dimension is chosen for the cut depends on the particle +(weight) distribution within the parent box. Normally the longest +dimension of the box is cut, but if all (or most) of the particles are +at one end of the box, a cut may be performed in another dimension to +induce sub-boxes that are more cube-ish (3d) or square-ish (2d) in +shape. + +After the cut is made, all the processors are also partitioned into 2 +groups, half assigned to the box on the lower side of the cut, and +half to the box on the upper side. (If the processor count is odd, +one side gets an extra processor.) The cut is positioned so that the +number of (weighted) particles in the lower box is exactly the number +that the processors assigned to that box should own for load balance +to be perfect. This also makes load balance for the upper box +perfect. The positioning of the cut is done iteratively, by a +bisectioning method (median search). Note that counting particles on +either side of the cut requires communication between all processors +at each iteration. That is the procedure for the first cut. Subsequent cuts are made recursively, in exactly the same manner. The subset of processors -assigned to each box make a new cut in the longest dimension of that -box, splitting the box, the subset of processors, and the particles -in the box in two. The recursion continues until every processor is -assigned a sub-box of the entire simulation domain, and owns the +assigned to each box make a new cut in one dimension of that box, +splitting the box, the subset of processors, and the particles in the +box in two. The recursion continues until every processor is assigned +a sub-box of the entire simulation domain, and owns the (weighted) particles in that sub-box. :line diff --git a/doc/src/change_box.txt b/doc/src/change_box.txt index a41463baaf..2c7a890d4c 100644 --- a/doc/src/change_box.txt +++ b/doc/src/change_box.txt @@ -101,11 +101,11 @@ Instead you could do something like this, assuming the simulation box is non-periodic and atoms extend from 0 to 20 in all dimensions: change_box all x final -10 20 -create_atoms 1 single -5 5 5 # this will fail to insert an atom :pre +create_atoms 1 single -5 5 5 # this will fail to insert an atom :pre change_box all x final -10 20 boundary f s s create_atoms 1 single -5 5 5 -change_box boundary s s s # this will work :pre +change_box all boundary s s s # this will work :pre NOTE: Unlike the earlier "displace_box" version of this command, atom remapping is NOT performed by default. This command allows remapping diff --git a/doc/src/create_atoms.txt b/doc/src/create_atoms.txt index da9c8809d0..98c3c24a0b 100644 --- a/doc/src/create_atoms.txt +++ b/doc/src/create_atoms.txt @@ -134,6 +134,17 @@ not overlap existing atoms inappropriately, especially if molecules are being added. The "delete_atoms"_delete_atoms.html command can be used to remove overlapping atoms or molecules. +NOTE: You cannot use any of the styles explained above to create atoms +that are outside the simulation box; they will just be ignored by +LAMMPS. This is true even if you are using shrink-wrapped box +boundaries, as specified by the "boundary"_boundary.html command. +However, you can first use the "change_box"_change_box.html command to +temporarily expand the box, then add atoms via create_atoms, then +finally use change_box command again if needed to re-shrink-wrap the +new atoms. See the "change_box"_change_box.html doc page for an +example of how to do this, using the create_atoms {single} style to +insert a new atom outside the current simulation box. + :line Individual atoms are inserted by this command, unless the {mol} diff --git a/doc/src/fix_halt.txt b/doc/src/fix_halt.txt index c9295eca69..3f7650466f 100644 --- a/doc/src/fix_halt.txt +++ b/doc/src/fix_halt.txt @@ -15,15 +15,16 @@ fix ID group-ID halt N attribute operator avalue keyword value ... :pre ID, group-ID are documented in "fix"_fix.html command :ulb,l halt = style name of this fix command :l N = check halt condition every N steps :l -attribute = hstyle or v_name :l - hstyle = {bondmax} +attribute = {bondmax} or {tlimit} or v_name :l + bondmax = length of longest bond in the system + tlimit = elapsed CPU time v_name = name of "equal-style variable"_variable.html :pre operator = "<" or "<=" or ">" or ">=" or "==" or "!=" or "|^" :l avalue = numeric value to compare attribute to :l -string = text string to print with optional variable names :l zero or more keyword/value pairs may be appended :l -keyword = {error} :l - {error} value = {hard} or {soft} or {continue} :pre +keyword = {error} or {message} :l + {error} value = {hard} or {soft} or {continue} + {message} value = {yes} or {no} :pre :ule [Examples:] @@ -40,14 +41,33 @@ specified by the "run"_run.html or "minimize"_minimize.html command. The specified group-ID is ignored by this fix. -The specified {attribute} can be one of the {hstyle} options listed -above, or an "equal-style variable"_variable.html referenced as -{v_name}, where "name" is the name of a variable that has been defined -previously in the input script. +The specified {attribute} can be one of the options listed above, +namely {bondmax} or {tlimit}, or an "equal-style +variable"_variable.html referenced as {v_name}, where "name" is the +name of a variable that has been defined previously in the input +script. -The only {hstyle} option currently implemented is {bondmax}. This -will loop over all bonds in the system, compute their current -lengths, and set {attribute} to the longest bond distance. +The {bondmax} attribute will loop over all bonds in the system, +compute their current lengths, and set {attribute} to the longest bond +distance. + +The {tlimit} attribute queries the elapsed CPU time (in seconds) since +the current run began, and sets {attribute} to that value. This is an +alternative way to limit the length of a simulation run, similar to +the "timer"_timer.html timeout command. There are two differences in +using this method versus the timer command option. The first is that +the clock starts at the beginning of the current run (not when the +timer or fix command is specified), so that any setup time for the run +is not included in the elapsed time. The second is that the timer +invocation and syncing across all processors (via MPI_Allreduce) is +not performed once every {N} steps by this command. Instead it is +performed (typically) only a small number of times and the elapsed +times are used to predict when the end-of-the-run will be. Both of +these attributes can be useful when performing benchmark calculations +for a desired length of time with minmimal overhead. For example, if +a run is performing 1000s of timesteps/sec, the overhead for syncing +the timer frequently across a large number of processors may be +non-negligble. Equal-style variables evaluate to a numeric value. See the "variable"_variable.html command for a description. They calculate @@ -100,6 +120,14 @@ Note that you may wish use the "unfix"_unfix.html command on the fix halt ID, so that the same condition is not immediately triggered in a subsequent run. +The optional {message} keyword determines whether a message is printed +to the screen and logfile when the half condition is triggered. If +{message} is set to yes, a one line message with the values that +triggered the halt is printed. If {message} is set to no, no message +is printed; the run simply exits. The latter may be desirable for +post-processing tools that extract thermodyanmic information from log +files. + [Restart, fix_modify, output, run start/stop, minimize info:] No information about this fix is written to "binary restart @@ -118,4 +146,4 @@ This fix is not invoked during "energy minimization"_minimize.html. [Default:] -The option defaults are error = hard. +The option defaults are error = hard and message = yes. diff --git a/doc/src/quit.txt b/doc/src/quit.txt index 809fb2e38b..843d3de7f3 100644 --- a/doc/src/quit.txt +++ b/doc/src/quit.txt @@ -17,7 +17,7 @@ status = numerical exit status (optional) [Examples:] quit -if "$n > 10000" then "quit 1":pre +if "$n > 10000" then "quit 1" :pre [Description:] diff --git a/doc/src/thermo_style.txt b/doc/src/thermo_style.txt index e30e7023e4..36ec7bf12e 100644 --- a/doc/src/thermo_style.txt +++ b/doc/src/thermo_style.txt @@ -36,7 +36,7 @@ args = list of arguments for a particular style :l elaplong = timesteps since start of initial run in a series of runs dt = timestep size time = simulation time - cpu = elapsed CPU time in seconds + cpu = elapsed CPU time in seconds since start of this run tpcpu = time per CPU second spcpu = timesteps per CPU second cpuremain = estimated CPU time remaining in run diff --git a/src/balance.cpp b/src/balance.cpp index 050f282dfe..52f6072a6c 100644 --- a/src/balance.cpp +++ b/src/balance.cpp @@ -456,6 +456,7 @@ void Balance::options(int iarg, int narg, char **arg) wtflag = 0; varflag = 0; + oldrcb = 0; outflag = 0; int outarg = 0; fp = NULL; @@ -491,6 +492,9 @@ void Balance::options(int iarg, int narg, char **arg) } iarg += 2+nopt; + } else if (strcmp(arg[iarg],"old") == 0) { + oldrcb = 1; + iarg++; } else if (strcmp(arg[iarg],"out") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal (fix) balance command"); outflag = 1; @@ -641,12 +645,21 @@ int *Balance::bisection(int sortflag) // invoke RCB // then invert() to create list of proc assignments for my atoms + // NOTE: (3/2017) can remove undocumented "old" option at some point + // ditto in rcb.cpp - if (wtflag) { - weight = fixstore->vstore; - rcb->compute(dim,atom->nlocal,atom->x,weight,shrinklo,shrinkhi); - } else rcb->compute(dim,atom->nlocal,atom->x,NULL,shrinklo,shrinkhi); - + if (oldrcb) { + if (wtflag) { + weight = fixstore->vstore; + rcb->compute_old(dim,atom->nlocal,atom->x,weight,shrinklo,shrinkhi); + } else rcb->compute_old(dim,atom->nlocal,atom->x,NULL,shrinklo,shrinkhi); + } else { + if (wtflag) { + weight = fixstore->vstore; + rcb->compute(dim,atom->nlocal,atom->x,weight,shrinklo,shrinkhi); + } else rcb->compute(dim,atom->nlocal,atom->x,NULL,shrinklo,shrinkhi); + } + rcb->invert(sortflag); // reset RCB lo/hi bounding box to full simulation box as needed diff --git a/src/balance.h b/src/balance.h index 82941b34c0..0f2f79bb15 100644 --- a/src/balance.h +++ b/src/balance.h @@ -53,6 +53,7 @@ class Balance : protected Pointers { int style; // style of LB int xflag,yflag,zflag; // xyz LB flags double *user_xsplit,*user_ysplit,*user_zsplit; // params for xyz LB + int oldrcb; // use old-style RCB compute int nitermax; // params for shift LB double stopthresh; diff --git a/src/fix_halt.cpp b/src/fix_halt.cpp index ca74efa454..2a4af4dc66 100644 --- a/src/fix_halt.cpp +++ b/src/fix_halt.cpp @@ -30,9 +30,10 @@ using namespace LAMMPS_NS; using namespace FixConst; -enum{BONDMAX,VARIABLE}; +enum{BONDMAX,TLIMIT,VARIABLE}; enum{LT,LE,GT,GE,EQ,NEQ,XOR}; enum{HARD,SOFT,CONTINUE}; +enum{NOMSG,YESMSG}; /* ---------------------------------------------------------------------- */ @@ -47,7 +48,8 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) : idvar = NULL; - if (strcmp(arg[4],"bondmax") == 0) attribute = BONDMAX; + if (strcmp(arg[4],"tlimit") == 0) attribute = TLIMIT; + else if (strcmp(arg[4],"bondmax") == 0) attribute = BONDMAX; else if (strncmp(arg[4],"v_",2) == 0) { attribute = VARIABLE; int n = strlen(arg[4]); @@ -73,6 +75,7 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) : // parse optional args eflag = SOFT; + msgflag = YESMSG; int iarg = 7; while (iarg < narg) { @@ -83,6 +86,12 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"continue") == 0) eflag = CONTINUE; else error->all(FLERR,"Illegal fix halt command"); iarg += 2; + } else if (strcmp(arg[iarg],"message") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal fix halt command"); + if (strcmp(arg[iarg+1],"no") == 0) msgflag = NOMSG; + else if (strcmp(arg[iarg+1],"yes") == 0) msgflag = YESMSG; + else error->all(FLERR,"Illegal fix halt command"); + iarg += 2; } else error->all(FLERR,"Illegal fix halt command"); } @@ -125,6 +134,11 @@ void FixHalt::init() if (input->variable->equalstyle(ivar) == 0) error->all(FLERR,"Fix halt variable is not equal-style variable"); } + + // settings used by TLIMIT + + nextstep = (update->ntimestep/nevery)*nevery + nevery; + tratio = 0.5; } /* ---------------------------------------------------------------------- */ @@ -135,8 +149,12 @@ void FixHalt::end_of_step() double attvalue; - if (attribute == BONDMAX) attvalue = bondmax(); - else { + if (attribute == TLIMIT) { + if (update->ntimestep != nextstep) return; + attvalue = tlimit(); + } else if (attribute == BONDMAX) { + attvalue = bondmax(); + } else { modify->clearstep_compute(); attvalue = input->variable->compute_equal(ivar); modify->addstep_compute(update->ntimestep + nevery); @@ -169,9 +187,10 @@ void FixHalt::end_of_step() sprintf(str,"Fix halt %s condition met on step %ld with value %g", id,update->ntimestep,attvalue); - if (eflag == HARD) error->all(FLERR,str); - else if (eflag == SOFT || eflag == CONTINUE) { - if (comm->me == 0) error->message(FLERR,str); + if (eflag == HARD) { + error->all(FLERR,str); + } else if (eflag == SOFT || eflag == CONTINUE) { + if (comm->me == 0 && msgflag == YESMSG) error->message(FLERR,str); timer->force_timeout(); } } @@ -218,3 +237,27 @@ double FixHalt::bondmax() return sqrt(maxall); } + +/* ---------------------------------------------------------------------- + compute synced elapsed time + reset nextstep = estimate of timestep when run will end + first project to 1/2 the run time, thereafter to end of run +------------------------------------------------------------------------- */ + +double FixHalt::tlimit() +{ + double cpu = timer->elapsed(Timer::TOTAL); + MPI_Bcast(&cpu,1,MPI_DOUBLE,0,world); + + if (cpu < value) { + bigint elapsed = update->ntimestep - update->firststep; + bigint final = update->firststep + + static_cast (tratio*value/cpu * elapsed); + nextstep = (final/nevery)*nevery + nevery; + tratio = 1.0; + } + + //printf("EVAL %ld %g %d\n",update->ntimestep,cpu,nevery); + + return cpu; +} diff --git a/src/fix_halt.h b/src/fix_halt.h index 156397a4b5..52d2f95d2f 100644 --- a/src/fix_halt.h +++ b/src/fix_halt.h @@ -35,11 +35,13 @@ class FixHalt : public Fix { void post_run(); private: - int attribute,operation,eflag,ivar; - double value; + int attribute,operation,eflag,msgflag,ivar; + bigint nextstep; + double value,tratio; char *idvar; double bondmax(); + double tlimit(); }; } diff --git a/src/rcb.cpp b/src/rcb.cpp index 7223e4616a..1aca015daf 100644 --- a/src/rcb.cpp +++ b/src/rcb.cpp @@ -42,7 +42,7 @@ RCB::RCB(LAMMPS *lmp) : Pointers(lmp) dots = NULL; nlist = maxlist = 0; - dotlist = dotmark = NULL; + dotlist = dotmark = dotmark_select = NULL; maxbuf = 0; buf = NULL; @@ -73,6 +73,7 @@ RCB::~RCB() memory->sfree(dots); memory->destroy(dotlist); memory->destroy(dotmark); + memory->destroy(dotmark_select); memory->sfree(buf); memory->destroy(recvproc); @@ -91,6 +92,9 @@ RCB::~RCB() /* ---------------------------------------------------------------------- perform RCB balancing of N particles at coords X in bounding box LO/HI + NEW version: each RCB cut is tested in all dimensions + dimeension that produces 2 boxes with largest min size is selected + this is to prevent very narrow boxes from being produced if wt = NULL, ignore per-particle weights if wt defined, per-particle weights > 0.0 dimension = 2 or 3 @@ -103,6 +107,523 @@ RCB::~RCB() void RCB::compute(int dimension, int n, double **x, double *wt, double *bboxlo, double *bboxhi) +{ + int i,j,k; + int keep,outgoing,incoming,incoming2; + int dim,markactive; + int indexlo,indexhi; + int first_iteration,breakflag; + double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax; + double targetlo,targethi; + double valuemin,valuemax,valuehalf,valuehalf_select,smaller; + double tolerance; + MPI_Comm comm,comm_half; + MPI_Request request,request2; + Median med,medme; + + // create list of my Dots + + ndot = nkeep = noriginal = n; + + if (ndot > maxdot) { + maxdot = ndot; + memory->sfree(dots); + dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots"); + } + + for (i = 0; i < ndot; i++) { + dots[i].x[0] = x[i][0]; + dots[i].x[1] = x[i][1]; + dots[i].x[2] = x[i][2]; + dots[i].proc = me; + dots[i].index = i; + } + + if (wt) + for (i = 0; i < ndot; i++) dots[i].wt = wt[i]; + else + for (i = 0; i < ndot; i++) dots[i].wt = 1.0; + + // initial bounding box = simulation box + // includes periodic or shrink-wrapped boundaries + + lo = bbox.lo; + hi = bbox.hi; + + lo[0] = bboxlo[0]; + lo[1] = bboxlo[1]; + lo[2] = bboxlo[2]; + hi[0] = bboxhi[0]; + hi[1] = bboxhi[1]; + hi[2] = bboxhi[2]; + + cut = 0.0; + cutdim = -1; + + // initialize counters + + counters[0] = 0; + counters[1] = 0; + counters[2] = 0; + counters[3] = ndot; + counters[4] = maxdot; + counters[5] = 0; + counters[6] = 0; + + // create communicator for use in recursion + + MPI_Comm_dup(world,&comm); + + // recurse until partition is a single proc = me + // proclower,procupper = lower,upper procs in partition + // procmid = 1st proc in upper half of partition + + int procpartner,procpartner2; + + int procmid; + int proclower = 0; + int procupper = nprocs - 1; + + while (proclower != procupper) { + + // if odd # of procs, lower partition gets extra one + + procmid = proclower + (procupper - proclower) / 2 + 1; + + // determine communication partner(s) + // readnumber = # of proc partners to read from + + if (me < procmid) + procpartner = me + (procmid - proclower); + else + procpartner = me - (procmid - proclower); + + int readnumber = 1; + if (procpartner > procupper) { + readnumber = 0; + procpartner--; + } + if (me == procupper && procpartner != procmid - 1) { + readnumber = 2; + procpartner2 = procpartner + 1; + } + + // wttot = summed weight of entire partition + // search tolerance = largest single weight (plus epsilon) + // targetlo = desired weight in lower half of partition + // targethi = desired weight in upper half of partition + + wtmax = wtsum = 0.0; + + if (wt) { + for (i = 0; i < ndot; i++) { + wtsum += dots[i].wt; + if (dots[i].wt > wtmax) wtmax = dots[i].wt; + } + } else { + for (i = 0; i < ndot; i++) wtsum += dots[i].wt; + wtmax = 1.0; + } + + MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm); + if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm); + else tolerance = 1.0; + + tolerance *= 1.0 + TINY; + targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower); + targethi = wttot - targetlo; + + // attempt a cut in each dimension + // each cut produces 2 boxes, each with a reduced box length in that dim + // smaller = the smaller of the 2 reduced box lengths in that dimension + // choose to cut in dimension which produces largest smaller value + // this should induce final proc sub-boxes to be as cube-ish as possible + // dim_select = selected cut dimension + // valuehalf_select = valuehalf in that dimension + // dotmark_select = dot markings in that dimension + + int dim_select = -1; + double largest = 0.0; + + for (dim = 0; dim < dimension; dim++) { + + // create active list and mark array for dots + // initialize active list to all dots + + if (ndot > maxlist) { + memory->destroy(dotlist); + memory->destroy(dotmark); + memory->destroy(dotmark_select); + maxlist = maxdot; + memory->create(dotlist,maxlist,"RCB:dotlist"); + memory->create(dotmark,maxlist,"RCB:dotmark"); + memory->create(dotmark_select,maxlist,"RCB:dotmark_select"); + } + + nlist = ndot; + for (i = 0; i < nlist; i++) dotlist[i] = i; + + // median iteration + // zoom in on bisector until correct # of dots in each half of partition + // as each iteration of median-loop begins, require: + // all non-active dots are marked with 0/1 in dotmark + // valuemin <= every active dot <= valuemax + // wtlo, wthi = total wt of non-active dots + // when leave median-loop, require only: + // valuehalf = correct cut position + // all dots <= valuehalf are marked with 0 in dotmark + // all dots >= valuehalf are marked with 1 in dotmark + // markactive = which side of cut is active = 0/1 + // indexlo,indexhi = indices of dot closest to median + + wtlo = wthi = 0.0; + valuemin = lo[dim]; + valuemax = hi[dim]; + first_iteration = 1; + indexlo = indexhi = 0; + + while (1) { + + // choose bisector value + // use old value on 1st iteration if old cut dimension is the same + // on 2nd option: could push valuehalf towards geometric center + // with "1.0-factor" to force overshoot + + if (first_iteration && reuse && dim == tree[procmid].dim) { + counters[5]++; + valuehalf = tree[procmid].cut; + if (valuehalf < valuemin || valuehalf > valuemax) + valuehalf = 0.5 * (valuemin + valuemax); + } else if (wt) + valuehalf = valuemin + (targetlo - wtlo) / + (wttot - wtlo - wthi) * (valuemax - valuemin); + else + valuehalf = 0.5 * (valuemin + valuemax); + + first_iteration = 0; + + // initialize local median data structure + + medme.totallo = medme.totalhi = 0.0; + medme.valuelo = -MYHUGE; + medme.valuehi = MYHUGE; + medme.wtlo = medme.wthi = 0.0; + medme.countlo = medme.counthi = 0; + medme.proclo = medme.prochi = me; + + // mark all active dots on one side or other of bisector + // also set all fields in median data struct + // save indices of closest dots on either side + + for (j = 0; j < nlist; j++) { + i = dotlist[j]; + if (dots[i].x[dim] <= valuehalf) { // in lower part + medme.totallo += dots[i].wt; + dotmark[i] = 0; + if (dots[i].x[dim] > medme.valuelo) { // my closest dot + medme.valuelo = dots[i].x[dim]; + medme.wtlo = dots[i].wt; + medme.countlo = 1; + indexlo = i; + } else if (dots[i].x[dim] == medme.valuelo) { // tied for closest + medme.wtlo += dots[i].wt; + medme.countlo++; + } + } + else { // in upper part + medme.totalhi += dots[i].wt; + dotmark[i] = 1; + if (dots[i].x[dim] < medme.valuehi) { // my closest dot + medme.valuehi = dots[i].x[dim]; + medme.wthi = dots[i].wt; + medme.counthi = 1; + indexhi = i; + } else if (dots[i].x[dim] == medme.valuehi) { // tied for closest + medme.wthi += dots[i].wt; + medme.counthi++; + } + } + } + + // combine median data struct across current subset of procs + + counters[0]++; + MPI_Allreduce(&medme,&med,1,med_type,med_op,comm); + + // test median guess for convergence + // move additional dots that are next to cut across it + + if (wtlo + med.totallo < targetlo) { // lower half TOO SMALL + + wtlo += med.totallo; + valuehalf = med.valuehi; + + if (med.counthi == 1) { // only one dot to move + if (wtlo + med.wthi < targetlo) { // move it, keep iterating + if (me == med.prochi) dotmark[indexhi] = 0; + } + else { // only move if beneficial + if (wtlo + med.wthi - targetlo < targetlo - wtlo) + if (me == med.prochi) dotmark[indexhi] = 0; + break; // all done + } + } + else { // multiple dots to move + breakflag = 0; + wtok = 0.0; + if (medme.valuehi == med.valuehi) wtok = medme.wthi; + if (wtlo + med.wthi >= targetlo) { // all done + MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm); + wtmax = targetlo - wtlo; + if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax); + breakflag = 1; + } // wtok = most I can move + for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) { + i = dotlist[j]; + if (dots[i].x[dim] == med.valuehi) { // only move if better + if (wtsum + dots[i].wt - wtok < wtok - wtsum) + dotmark[i] = 0; + wtsum += dots[i].wt; + } + } + if (breakflag) break; // done if moved enough + } + + wtlo += med.wthi; + if (targetlo-wtlo <= tolerance) break; // close enough + + valuemin = med.valuehi; // iterate again + markactive = 1; + } + + else if (wthi + med.totalhi < targethi) { // upper half TOO SMALL + + wthi += med.totalhi; + valuehalf = med.valuelo; + + if (med.countlo == 1) { // only one dot to move + if (wthi + med.wtlo < targethi) { // move it, keep iterating + if (me == med.proclo) dotmark[indexlo] = 1; + } + else { // only move if beneficial + if (wthi + med.wtlo - targethi < targethi - wthi) + if (me == med.proclo) dotmark[indexlo] = 1; + break; // all done + } + } + else { // multiple dots to move + breakflag = 0; + wtok = 0.0; + if (medme.valuelo == med.valuelo) wtok = medme.wtlo; + if (wthi + med.wtlo >= targethi) { // all done + MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm); + wtmax = targethi - wthi; + if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax); + breakflag = 1; + } // wtok = most I can move + for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) { + i = dotlist[j]; + if (dots[i].x[dim] == med.valuelo) { // only move if better + if (wtsum + dots[i].wt - wtok < wtok - wtsum) + dotmark[i] = 1; + wtsum += dots[i].wt; + } + } + if (breakflag) break; // done if moved enough + } + + wthi += med.wtlo; + if (targethi-wthi <= tolerance) break; // close enough + + valuemax = med.valuelo; // iterate again + markactive = 0; + } + + else // Goldilocks result: both partitions just right + break; + + // shrink the active list + + k = 0; + for (j = 0; j < nlist; j++) { + i = dotlist[j]; + if (dotmark[i] == markactive) dotlist[k++] = i; + } + nlist = k; + } + + // cut produces 2 sub-boxes with reduced size in dim + // compare smaller of the 2 sizes to previous dims + // keep dim that has the largest smaller + + smaller = MIN(valuehalf-lo[dim],hi[dim]-valuehalf); + if (smaller > largest) { + largest = smaller; + dim_select = dim; + valuehalf_select = valuehalf; + memcpy(dotmark_select,dotmark,ndot*sizeof(int)); + } + } + + // copy results for best dim cut into dim,valuehalf,dotmark + + dim = dim_select; + valuehalf = valuehalf_select; + memcpy(dotmark,dotmark_select,ndot*sizeof(int)); + + // found median + // store cut info only if I am procmid + + if (me == procmid) { + cut = valuehalf; + cutdim = dim; + } + + // use cut to shrink my RCB bounding box + + if (me < procmid) hi[dim] = valuehalf; + else lo[dim] = valuehalf; + + // outgoing = number of dots to ship to partner + // nkeep = number of dots that have never migrated + + markactive = (me < procpartner); + for (i = 0, keep = 0, outgoing = 0; i < ndot; i++) + if (dotmark[i] == markactive) outgoing++; + else if (i < nkeep) keep++; + nkeep = keep; + + // alert partner how many dots I'll send, read how many I'll recv + + MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world); + incoming = 0; + if (readnumber) { + MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE); + if (readnumber == 2) { + MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,MPI_STATUS_IGNORE); + incoming += incoming2; + } + } + + // check if need to alloc more space + + int ndotnew = ndot - outgoing + incoming; + if (ndotnew > maxdot) { + while (maxdot < ndotnew) maxdot += DELTA; + dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots"); + counters[6]++; + } + + counters[1] += outgoing; + counters[2] += incoming; + if (ndotnew > counters[3]) counters[3] = ndotnew; + if (maxdot > counters[4]) counters[4] = maxdot; + + // malloc comm send buffer + + if (outgoing > maxbuf) { + memory->sfree(buf); + maxbuf = outgoing; + buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf"); + } + + // fill buffer with dots that are marked for sending + // pack down the unmarked ones + + keep = outgoing = 0; + for (i = 0; i < ndot; i++) { + if (dotmark[i] == markactive) + memcpy(&buf[outgoing++],&dots[i],sizeof(Dot)); + else + memcpy(&dots[keep++],&dots[i],sizeof(Dot)); + } + + // post receives for dots + + if (readnumber > 0) { + MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR, + procpartner,1,world,&request); + if (readnumber == 2) { + keep += incoming - incoming2; + MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR, + procpartner2,1,world,&request2); + } + } + + // handshake before sending dots to insure recvs have been posted + + if (readnumber > 0) { + MPI_Send(NULL,0,MPI_INT,procpartner,0,world); + if (readnumber == 2) MPI_Send(NULL,0,MPI_INT,procpartner2,0,world); + } + MPI_Recv(NULL,0,MPI_INT,procpartner,0,world,MPI_STATUS_IGNORE); + + // send dots to partner + + MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world); + + // wait until all dots are received + + if (readnumber > 0) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + if (readnumber == 2) MPI_Wait(&request2,MPI_STATUS_IGNORE); + } + + ndot = ndotnew; + + // cut partition in half, create new communicators of 1/2 size + + int split; + if (me < procmid) { + procupper = procmid - 1; + split = 0; + } else { + proclower = procmid; + split = 1; + } + + MPI_Comm_split(comm,split,me,&comm_half); + MPI_Comm_free(&comm); + comm = comm_half; + } + + // clean up + + MPI_Comm_free(&comm); + + // set public variables with results of rebalance + + nfinal = ndot; + + if (nfinal > maxrecv) { + memory->destroy(recvproc); + memory->destroy(recvindex); + maxrecv = nfinal; + memory->create(recvproc,maxrecv,"RCB:recvproc"); + memory->create(recvindex,maxrecv,"RCB:recvindex"); + } + + for (i = 0; i < nfinal; i++) { + recvproc[i] = dots[i].proc; + recvindex[i] = dots[i].index; + } +} + +/* ---------------------------------------------------------------------- + perform RCB balancing of N particles at coords X in bounding box LO/HI + OLD version: each RCB cut is made in longest dimension of sub-box + if wt = NULL, ignore per-particle weights + if wt defined, per-particle weights > 0.0 + dimension = 2 or 3 + as documented in rcb.h: + sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi + all proc particles will be inside or on surface of 3-d box + defined by final lo/hi + // NOTE: worry about re-use of data structs for fix balance? +------------------------------------------------------------------------- */ + +void RCB::compute_old(int dimension, int n, double **x, double *wt, + double *bboxlo, double *bboxhi) { int i,j,k; int keep,outgoing,incoming,incoming2; diff --git a/src/rcb.h b/src/rcb.h index 7a82d42f93..90b82f5952 100644 --- a/src/rcb.h +++ b/src/rcb.h @@ -42,6 +42,7 @@ class RCB : protected Pointers { RCB(class LAMMPS *); ~RCB(); void compute(int, int, double **, double *, double *, double *); + void compute_old(int, int, double **, double *, double *, double *); void invert(int sortflag = 0); bigint memory_usage(); @@ -99,6 +100,7 @@ class RCB : protected Pointers { int maxlist; int *dotlist; int *dotmark; + int *dotmark_select; int maxbuf; Dot *buf; diff --git a/src/thermo.cpp b/src/thermo.cpp index 75e72ada64..dbbeff4998 100644 --- a/src/thermo.cpp +++ b/src/thermo.cpp @@ -55,7 +55,8 @@ using namespace MathConst; // customize a new keyword by adding to this list: -// step, elapsed, elaplong, dt, time, cpu, tpcpu, spcpu, cpuremain, part, timeremain +// step, elapsed, elaplong, dt, time, cpu, tpcpu, spcpu, cpuremain, +// part, timeremain // atoms, temp, press, pe, ke, etotal, enthalpy // evdwl, ecoul, epair, ebond, eangle, edihed, eimp, emol, elong, etail // vol, density, lx, ly, lz, xlo, xhi, ylo, yhi, zlo, zhi, xy, xz, yz, @@ -394,6 +395,11 @@ void Thermo::compute(int flag) if (flushflag) fflush(logfile); } } + + // set to 1, so that subsequent invocations of CPU time will be non-zero + // e.g. via variables in print command + + firststep = 1; } /* ---------------------------------------------------------------------- diff --git a/src/version.h b/src/version.h index 703db28c26..e0aa18371c 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "7 Mar 2017" +#define LAMMPS_VERSION "10 Mar 2017"