From 31ed3f7178a349b5d47827eb8fefea90b034e7cd Mon Sep 17 00:00:00 2001
From: "Steven J. Plimpton" <sjplimp@singsing.sandia.gov>
Date: Mon, 27 Aug 2018 17:09:59 -0600
Subject: [PATCH] more changes to insure each triplet IJK computed exactly once

---
 ...og.23Aug18.atm.g++.1 => log.27Aug18.g++.1} | 32 +++++------
 ...og.23Aug18.atm.g++.4 => log.27Aug18.g++.4} | 32 +++++------
 src/MANYBODY/pair_atm.cpp                     | 54 ++++++++-----------
 3 files changed, 53 insertions(+), 65 deletions(-)
 rename examples/atm/{log.23Aug18.atm.g++.1 => log.27Aug18.g++.1} (69%)
 rename examples/atm/{log.23Aug18.atm.g++.4 => log.27Aug18.g++.4} (69%)

diff --git a/examples/atm/log.23Aug18.atm.g++.1 b/examples/atm/log.27Aug18.g++.1
similarity index 69%
rename from examples/atm/log.23Aug18.atm.g++.1
rename to examples/atm/log.27Aug18.g++.1
index 3373846b31..46215e108c 100644
--- a/examples/atm/log.23Aug18.atm.g++.1
+++ b/examples/atm/log.27Aug18.g++.1
@@ -26,7 +26,7 @@ Created orthogonal box = (0 0 0) to (18.3252 18.3252 18.3252)
   1 by 1 by 1 MPI processor grid
 create_atoms	1 box
 Created 4000 atoms
-  Time spent = 0.00126314 secs
+  Time spent = 0.00139618 secs
 
 pair_style	hybrid/overlay lj/cut 4.5 atm 4.5 2.5
 pair_coeff	* * lj/cut 1.0 1.0
@@ -60,26 +60,26 @@ Neighbor list info ...
       bin: standard
 Per MPI rank memory allocation (min/avg/max) = 11.47 | 11.47 | 11.47 Mbytes
 Step Temp E_pair E_mol TotEng Press 
-       0        1.033   -4.8899813            0   -3.3408686   -4.2298176 
-       5    1.0337853   -4.8928208            0   -3.3425304   -4.2233154 
-      10    1.0358056   -4.8953304            0   -3.3420104   -4.1897183 
-      15    1.0380938   -4.8990457            0   -3.3422942   -4.1310148 
-      20    1.0389566   -4.9014345            0   -3.3433892   -4.0406616 
-      25    1.0358313   -4.8989663            0   -3.3456079   -3.9093019 
-Loop time of 12.2062 on 1 procs for 25 steps with 4000 atoms
+       0        1.033   -4.8404387            0    -3.291326   -4.1332095 
+       5    1.0337247   -4.8402263            0    -3.290027   -4.1207962 
+      10    1.0355935   -4.8425889            0   -3.2895869   -4.0870158 
+      15    1.0376519     -4.84599            0   -3.2899013   -4.0278711 
+      20    1.0382257   -4.8478854            0   -3.2909361   -3.9368052 
+      25    1.0347886     -4.84473            0   -3.2929351   -3.8044469 
+Loop time of 15.95 on 1 procs for 25 steps with 4000 atoms
 
-Performance: 353.920 tau/day, 2.048 timesteps/s
-99.9% CPU use with 1 MPI tasks x no OpenMP threads
+Performance: 270.846 tau/day, 1.567 timesteps/s
+100.0% CPU use with 1 MPI tasks x no OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 12.202     | 12.202     | 12.202     |   0.0 | 99.96
+Pair    | 15.946     | 15.946     | 15.946     |   0.0 | 99.97
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
-Comm    | 0.0015621  | 0.0015621  | 0.0015621  |   0.0 |  0.01
-Output  | 0.00020814 | 0.00020814 | 0.00020814 |   0.0 |  0.00
-Modify  | 0.0019698  | 0.0019698  | 0.0019698  |   0.0 |  0.02
-Other   |            | 0.0007734  |            |       |  0.01
+Comm    | 0.0015042  | 0.0015042  | 0.0015042  |   0.0 |  0.01
+Output  | 0.00013781 | 0.00013781 | 0.00013781 |   0.0 |  0.00
+Modify  | 0.0017776  | 0.0017776  | 0.0017776  |   0.0 |  0.01
+Other   |            | 0.0006771  |            |       |  0.00
 
 Nlocal:    4000 ave 4000 max 4000 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
@@ -97,4 +97,4 @@ Dangerous builds = 0
 
 Please see the log.cite file for references relevant to this simulation
 
-Total wall time: 0:00:13
+Total wall time: 0:00:16
diff --git a/examples/atm/log.23Aug18.atm.g++.4 b/examples/atm/log.27Aug18.g++.4
similarity index 69%
rename from examples/atm/log.23Aug18.atm.g++.4
rename to examples/atm/log.27Aug18.g++.4
index d5edfe32ba..d84f17ee2b 100644
--- a/examples/atm/log.23Aug18.atm.g++.4
+++ b/examples/atm/log.27Aug18.g++.4
@@ -26,7 +26,7 @@ Created orthogonal box = (0 0 0) to (18.3252 18.3252 18.3252)
   1 by 2 by 2 MPI processor grid
 create_atoms	1 box
 Created 4000 atoms
-  Time spent = 0.000785112 secs
+  Time spent = 0.000900984 secs
 
 pair_style	hybrid/overlay lj/cut 4.5 atm 4.5 2.5
 pair_coeff	* * lj/cut 1.0 1.0
@@ -60,26 +60,26 @@ Neighbor list info ...
       bin: standard
 Per MPI rank memory allocation (min/avg/max) = 5.532 | 5.532 | 5.532 Mbytes
 Step Temp E_pair E_mol TotEng Press 
-       0        1.033   -4.8921547            0    -3.343042   -4.2340557 
-       5    1.0337949   -4.8947881            0   -3.3444835   -4.2271456 
-      10    1.0358286   -4.8973178            0   -3.3439632   -4.1935779 
-      15    1.0381322   -4.9010593            0   -3.3442503    -4.134913 
-      20    1.0390107   -4.9034854            0   -3.3453589   -4.0446162 
-      25    1.0358988   -4.9010506            0   -3.3475908   -3.9133006 
-Loop time of 3.20632 on 4 procs for 25 steps with 4000 atoms
+       0        1.033   -4.8404387            0    -3.291326   -4.1332095 
+       5    1.0337247   -4.8402263            0    -3.290027   -4.1207962 
+      10    1.0355935   -4.8425889            0   -3.2895869   -4.0870158 
+      15    1.0376519     -4.84599            0   -3.2899013   -4.0278711 
+      20    1.0382257   -4.8478854            0   -3.2909361   -3.9368052 
+      25    1.0347886     -4.84473            0   -3.2929351   -3.8044469 
+Loop time of 4.34636 on 4 procs for 25 steps with 4000 atoms
 
-Performance: 1347.340 tau/day, 7.797 timesteps/s
-100.0% CPU use with 4 MPI tasks x no OpenMP threads
+Performance: 993.935 tau/day, 5.752 timesteps/s
+99.6% CPU use with 4 MPI tasks x no OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 3.1207     | 3.1553     | 3.1859     |   1.5 | 98.41
+Pair    | 3.9977     | 4.1036     | 4.209      |   4.9 | 94.41
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
-Comm    | 0.019466   | 0.05009    | 0.084602   |  12.0 |  1.56
-Output  | 7.1049e-05 | 8.2076e-05 | 0.00011325 |   0.0 |  0.00
-Modify  | 0.00056338 | 0.00057292 | 0.00058413 |   0.0 |  0.02
-Other   |            | 0.0003092  |            |       |  0.01
+Comm    | 0.13588    | 0.24134    | 0.34722    |  20.4 |  5.55
+Output  | 0.00013757 | 0.00015104 | 0.00016761 |   0.0 |  0.00
+Modify  | 0.00087953 | 0.00091547 | 0.00095582 |   0.0 |  0.02
+Other   |            | 0.0003656  |            |       |  0.01
 
 Nlocal:    1000 ave 1000 max 1000 min
 Histogram: 4 0 0 0 0 0 0 0 0 0
@@ -97,4 +97,4 @@ Dangerous builds = 0
 
 Please see the log.cite file for references relevant to this simulation
 
-Total wall time: 0:00:03
+Total wall time: 0:00:04
diff --git a/src/MANYBODY/pair_atm.cpp b/src/MANYBODY/pair_atm.cpp
index b63407dfe9..a6e52faeba 100644
--- a/src/MANYBODY/pair_atm.cpp
+++ b/src/MANYBODY/pair_atm.cpp
@@ -98,9 +98,15 @@ void PairATM::compute(int eflag, int vflag)
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
-  int count1 = 0;
-  int count2 = 0;
-  int count3 = 0;
+  // triple loop over local atoms and neighbors twice
+  // must compute each IJK triplet interaction exactly once
+  // by proc that owns the triplet atom with smallest x coord
+  //   special logic to break ties if multiple atoms have same x or y coords
+  // inner two loops for jj=1,Jnum and kk=jj+1,Jnum insure
+  //   the pair of other 2 non-minimum-x atoms is only considered once
+  // triplet geometry criteria for calculation:
+  //   each pair distance <= cutoff
+  //   produce of 3 pair distances <= cutoff_triple^3
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
@@ -112,12 +118,10 @@ void PairATM::compute(int eflag, int vflag)
     jnum = numneigh[i];
     jnumm1 = jnum - 1;
 
-    //    for (jj = 0; jj < jnumm1; jj++) {
-    // replace with this line:
-    for (jj = 0; jj < jnum; jj++) {
-
+    for (jj = 0; jj < jnumm1; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
+
       rij[0] = x[j][0] - xi;
       if (rij[0] < 0.0) continue;
       rij[1] = x[j][1] - yi;
@@ -125,40 +129,33 @@ void PairATM::compute(int eflag, int vflag)
       rij[2] = x[j][2] - zi;
       if (rij[0] == 0.0 and rij[1] == 0.0 and rij[2] < 0.0) continue;
       rij2 = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
-      count1++;
       if (rij2 > cutoff_squared) continue;
-      count2++;
-
-      //for (kk = jj+1; kk < jnum; kk++) {
-      // replace with these two lines:
-      for (kk = 0; kk < jnum; kk++) {
-        if (kk == jj) continue;
 
+      for (kk = jj+1; kk < jnum; kk++) {
         k = jlist[kk];
         k &= NEIGHMASK;
 
-        rjk[0] = x[k][0] - x[j][0];
-        if (rjk[0] < 0.0) continue;
-        rjk[1] = x[k][1] - x[j][1];
-        if (rjk[0] == 0.0 and rjk[1] < 0.0) continue;
-        rjk[2] = x[k][2] - x[j][2];
-        if (rjk[0] == 0.0 and rjk[1] == 0.0 and rjk[2] < 0.0) continue;
-        rjk2 = rjk[0]*rjk[0] + rjk[1]*rjk[1] + rjk[2]*rjk[2];
-        if (rjk2 > cutoff_squared) continue;
-
         rik[0] = x[k][0] - xi;
+        if (rik[0] < 0.0) continue;
         rik[1] = x[k][1] - yi;
+        if (rik[0] == 0.0 and rik[1] < 0.0) continue;
         rik[2] = x[k][2] - zi;
+        if (rik[0] == 0.0 and rik[1] == 0.0 and rik[2] < 0.0) continue;
         rik2 = rik[0]*rik[0] + rik[1]*rik[1] + rik[2]*rik[2];
         if (rik2 > cutoff_squared) continue;
 
+        rjk[0] = x[k][0] - x[j][0];
+        rjk[1] = x[k][1] - x[j][1];
+        rjk[2] = x[k][2] - x[j][2];
+        rjk2 = rjk[0]*rjk[0] + rjk[1]*rjk[1] + rjk[2]*rjk[2];
+        if (rjk2 > cutoff_squared) continue;
+
         double r6 = rij2*rjk2*rik2;
         if (r6 > cutoff_triple_sixth) continue;
 
         nu_local = nu[type[i]][type[j]][type[k]];
         if (nu_local == 0.0) continue;
 
-        count3++;
         interaction_ddd(nu_local,
                         r6,rij2,rik2,rjk2,rij,rik,rjk,fj,fk,eflag,evdwl);
 
@@ -177,15 +174,6 @@ void PairATM::compute(int eflag, int vflag)
     }
   }
 
-  int count = count1;
-  MPI_Allreduce(&count,&count1,1,MPI_INT,MPI_SUM,world);
-  count = count2;
-  MPI_Allreduce(&count,&count2,1,MPI_INT,MPI_SUM,world);
-  count = count3;
-  MPI_Allreduce(&count,&count3,1,MPI_INT,MPI_SUM,world);
-  printf("FORCE %g %d %d %d\n",cutoff_squared,count1,count2,count3);
-
-
   if (vflag_fdotr) virial_fdotr_compute();
 }