From a243be2dc9c25e7fc175d899b2f2d796e9cb7bd6 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Sun, 21 Apr 2019 22:10:03 -0600
Subject: [PATCH 01/21] Added bare-bones yarray algorithm, 2x speedup

---
 src/SNAP/pair_snap.cpp |  84 ++++++++++++++++------------
 src/SNAP/sna.cpp       | 124 +++++++++++++++++++++++++++++++++++++++++
 src/SNAP/sna.h         |   3 +
 3 files changed, 174 insertions(+), 37 deletions(-)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 8b547e6e73..73faaa71f7 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -158,9 +158,12 @@ PairSNAP::~PairSNAP()
 
 void PairSNAP::compute(int eflag, int vflag)
 {
-  if (use_optimized)
-    compute_optimized(eflag, vflag);
-  else
+//   if (use_optimized)
+//     compute_optimized(eflag, vflag);
+//   else
+
+// hard-code compute_regular()
+ 
     compute_regular(eflag, vflag);
 }
 
@@ -248,51 +251,58 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
     double* coeffi = coeffelem[ielem];
 
+    // omit beta0 from beta vector
+
+    double* beta = coeffi+1; 
+    snaptr->compute_yi(beta);
+
     for (int jj = 0; jj < ninside; jj++) {
       int j = snaptr->inside[jj];
       snaptr->compute_duidrj(snaptr->rij[jj],
                              snaptr->wj[jj],snaptr->rcutij[jj]);
 
-      snaptr->compute_dbidrj();
-      snaptr->copy_dbi2dbvec();
+//       snaptr->compute_dbidrj();
+//       snaptr->copy_dbi2dbvec();
 
-      fij[0] = 0.0;
-      fij[1] = 0.0;
-      fij[2] = 0.0;
+//       fij[0] = 0.0;
+//       fij[1] = 0.0;
+//       fij[2] = 0.0;
 
-      // linear contributions
+//       // linear contributions
 
-      for (int k = 1; k <= ncoeff; k++) {
-        double bgb = coeffi[k];
-        fij[0] += bgb*snaptr->dbvec[k-1][0];
-        fij[1] += bgb*snaptr->dbvec[k-1][1];
-        fij[2] += bgb*snaptr->dbvec[k-1][2];
-      }
+//       for (int k = 1; k <= ncoeff; k++) {
+//         double bgb = coeffi[k];
+//         fij[0] += bgb*snaptr->dbvec[k-1][0];
+//         fij[1] += bgb*snaptr->dbvec[k-1][1];
+//         fij[2] += bgb*snaptr->dbvec[k-1][2];
+//       }
 
-      // quadratic contributions
+//       // quadratic contributions
 
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = snaptr->bvec[icoeff];
-          double fack = coeffi[k]*bveci;
-          double* dbveci = snaptr->dbvec[icoeff];
-          fij[0] += fack*dbveci[0];
-          fij[1] += fack*dbveci[1];
-          fij[2] += fack*dbveci[2];
-          k++;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            double facki = coeffi[k]*bveci;
-            double fackj = coeffi[k]*snaptr->bvec[jcoeff];
-            double* dbvecj = snaptr->dbvec[jcoeff];
+//       if (quadraticflag) {
+//         int k = ncoeff+1;
+//         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+//           double bveci = snaptr->bvec[icoeff];
+//           double fack = coeffi[k]*bveci;
+//           double* dbveci = snaptr->dbvec[icoeff];
+//           fij[0] += fack*dbveci[0];
+//           fij[1] += fack*dbveci[1];
+//           fij[2] += fack*dbveci[2];
+//           k++;
+//           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+//             double facki = coeffi[k]*bveci;
+//             double fackj = coeffi[k]*snaptr->bvec[jcoeff];
+//             double* dbvecj = snaptr->dbvec[jcoeff];
 
-            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-            k++;
-          }
-        }
-      }
+//             fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
+//             fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
+//             fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
+//             k++;
+//           }
+//         }
+//       }
+
+      snaptr->compute_deidrj(fij);
 
       f[i][0] += fij[0];
       f[i][1] += fij[1];
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index 7ed1bc1e23..d30d94dc9d 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -522,6 +522,124 @@ void SNA::compute_zi_omp(int sub_threads)
   }
 }
 
+/* ----------------------------------------------------------------------
+   compute Yi by summing over products of beta and Zi
+------------------------------------------------------------------------- */
+
+void SNA::compute_yi(double* beta)
+{
+  int j;
+  int idxz_count;
+  double **jjjzarray_r, **jjjzarray_i;
+
+  for(int j = 0; j <= twojmax; j++) {
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        yarray_r[j][ma][mb] = 0.0;
+        yarray_i[j][ma][mb] = 0.0;
+      } // end loop over ma, mb
+  } // end loop over j
+
+  for(int JJ = 0; JJ < idxj_max; JJ++) {
+    const int j1 = idxj[JJ].j1;
+    const int j2 = idxj[JJ].j2;
+    const int j3 = idxj[JJ].j;
+
+    j = j3;
+    jjjzarray_r = zarray_r[j1][j2][j3];
+    jjjzarray_i = zarray_i[j1][j2][j3];
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb];
+        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb];
+      } // end loop over ma, mb
+
+    j = j1;
+    jjjzarray_r = zarray_r[j3][j2][j1];
+    jjjzarray_i = zarray_i[j3][j2][j1];
+    double j1fac = (j3+1)/(j+1.0);
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j1fac;
+        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j1fac;
+      } // end loop over ma, mb
+
+    j = j2;
+    jjjzarray_r = zarray_r[j3][j1][j2];
+    jjjzarray_i = zarray_i[j3][j1][j2];
+    double j2fac = (j3+1)/(j+1.0);
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j2fac;
+        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j2fac;
+      } // end loop over ma, mb
+
+  } // end loop over jjb
+
+}
+
+/* ----------------------------------------------------------------------
+   compute dEidRj
+------------------------------------------------------------------------- */
+
+void SNA::compute_deidrj(double* dedr)
+{
+
+  for(int k = 0; k < 3; k++)
+    dedr[k] = 0.0;
+
+  for(int j = 0; j <= twojmax; j++) {
+
+    for(int mb = 0; 2*mb < j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+
+        double* dudr_r = duarray_r[j][ma][mb];
+        double* dudr_i = duarray_i[j][ma][mb];
+        double jjjmambyarray_r = yarray_r[j][ma][mb];
+        double jjjmambyarray_i = yarray_i[j][ma][mb];
+        for(int k = 0; k < 3; k++)
+          dedr[k] +=
+            dudr_r[k] * jjjmambyarray_r +
+            dudr_i[k] * jjjmambyarray_i;
+
+      } //end loop over ma mb
+
+    // For j even, handle middle column
+
+    if (j%2 == 0) {
+
+      int mb = j/2;
+      for(int ma = 0; ma < mb; ma++) {
+        double* dudr_r = duarray_r[j][ma][mb];
+        double* dudr_i = duarray_i[j][ma][mb];
+        double jjjmambyarray_r = yarray_r[j][ma][mb];
+        double jjjmambyarray_i = yarray_i[j][ma][mb];
+        for(int k = 0; k < 3; k++)
+          dedr[k] +=
+            dudr_r[k] * jjjmambyarray_r +
+            dudr_i[k] * jjjmambyarray_i;
+
+      }
+
+      int ma = mb;
+      double* dudr_r = duarray_r[j][ma][mb];
+      double* dudr_i = duarray_i[j][ma][mb];
+      double jjjmambyarray_r = yarray_r[j][ma][mb];
+      double jjjmambyarray_i = yarray_i[j][ma][mb];
+      for(int k = 0; k < 3; k++)
+        dedr[k] += 
+          (dudr_r[k] * jjjmambyarray_r +
+           dudr_i[k] * jjjmambyarray_i)*0.5;
+        
+    } // end if jeven
+
+  } // End loop over j
+
+  for(int k = 0; k < 3; k++)
+    dedr[k] *= 2.0;
+
+}
+
 /* ----------------------------------------------------------------------
    compute Bi by summing conj(Ui)*Zi
 ------------------------------------------------------------------------- */
@@ -1535,6 +1653,10 @@ void SNA::create_twojmax_arrays()
                    "sna:uarraytot");
     memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
                    "sna:zarray");
+  memory->create(yarray_r, jdim, jdim, jdim,
+                 "sna:yarray");
+  memory->create(yarray_i, jdim, jdim, jdim,
+                 "sna:yarray");
   }
 
 }
@@ -1563,6 +1685,8 @@ void SNA::destroy_twojmax_arrays()
     memory->destroy(zarray_r);
     memory->destroy(uarraytot_i);
     memory->destroy(zarray_i);
+    memory->destroy(yarray_r);
+    memory->destroy(yarray_i);
   }
 }
 
diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h
index d05ad0fb84..2c90da1d30 100644
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@@ -47,6 +47,7 @@ public:
   void compute_ui_omp(int, int);
   void compute_zi();
   void compute_zi_omp(int);
+  void compute_yi(double*);
   void compute_bi();
   void copy_bi2bvec();
 
@@ -54,6 +55,7 @@ public:
 
   void compute_duidrj(double*, double, double);
   void compute_dbidrj();
+  void compute_deidrj(double*);
   void compute_dbidrj_nonsymm();
   void copy_dbi2dbvec();
   double compute_sfac(double, double);
@@ -80,6 +82,7 @@ public:
   int twojmax, diagonalstyle;
   double*** uarraytot_r, *** uarraytot_i;
   double***** zarray_r, ***** zarray_i;
+  double*** yarray_r, *** yarray_i;
   double*** uarraytot_r_b, *** uarraytot_i_b;
   double***** zarray_r_b, ***** zarray_i_b;
   double*** uarray_r, *** uarray_i;

From 6d84bd6138fedd3738979e791244ccc4f38b6606 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Fri, 10 May 2019 10:34:01 -0600
Subject: [PATCH 02/21] Added compute_beta()

---
 src/SNAP/pair_snap.cpp | 43 +++++++++++++++++++++---------------------
 src/SNAP/pair_snap.h   |  3 +++
 src/SNAP/sna.cpp       |  8 ++++----
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 73faaa71f7..268e5663da 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -116,6 +116,7 @@ PairSNAP::~PairSNAP()
     memory->destroy(radelem);
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
+    memory->destroy(beta);
   }
 
   // Need to set this because restart not handled by PairHybrid
@@ -246,14 +247,15 @@ void PairSNAP::compute_regular(int eflag, int vflag)
     }
 
     // for neighbors of I within cutoff:
-    // compute dUi/drj and dBi/drj
-    // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
+    // compute Fij = dEi/dRj = -dEi/dRi 
+    // add to Fi, subtract from Fj
 
-    double* coeffi = coeffelem[ielem];
+    // compute dE_i/dB_i = beta_i
 
-    // omit beta0 from beta vector
+    compute_betai(ielem);
+
+    // compute beta_i*Z_i = Y_i
 
-    double* beta = coeffi+1; 
     snaptr->compute_yi(beta);
 
     for (int jj = 0; jj < ninside; jj++) {
@@ -261,22 +263,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
       snaptr->compute_duidrj(snaptr->rij[jj],
                              snaptr->wj[jj],snaptr->rcutij[jj]);
 
-//       snaptr->compute_dbidrj();
-//       snaptr->copy_dbi2dbvec();
-
-//       fij[0] = 0.0;
-//       fij[1] = 0.0;
-//       fij[2] = 0.0;
-
-//       // linear contributions
-
-//       for (int k = 1; k <= ncoeff; k++) {
-//         double bgb = coeffi[k];
-//         fij[0] += bgb*snaptr->dbvec[k-1][0];
-//         fij[1] += bgb*snaptr->dbvec[k-1][1];
-//         fij[2] += bgb*snaptr->dbvec[k-1][2];
-//       }
-
 //       // quadratic contributions
 
 //       if (quadraticflag) {
@@ -326,6 +312,7 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
       // evdwl = energy of atom I, sum over coeffs_k * Bi_k
 
+      double* coeffi = coeffelem[ielem];
       evdwl = coeffi[0];
       if (!quadraticflag) {
         snaptr->compute_bi();
@@ -1306,6 +1293,18 @@ void PairSNAP::build_per_atom_arrays()
 #endif
 }
 
+/* ----------------------------------------------------------------------
+   compute beta_i 
+------------------------------------------------------------------------- */
+
+void PairSNAP::compute_betai(int ielem)
+{
+  double* coeffi = coeffelem[ielem];
+
+  for (int k = 1; k <= ncoeff; k++)
+    beta[k-1] = coeffi[k];
+}
+
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
@@ -1434,6 +1433,7 @@ void PairSNAP::coeff(int narg, char **arg)
     memory->destroy(radelem);
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
+    memory->destroy(beta);
   }
 
   char* type1 = arg[0];
@@ -1631,6 +1631,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
   memory->create(radelem,nelements,"pair:radelem");
   memory->create(wjelem,nelements,"pair:wjelem");
   memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
+  memory->create(beta,ncoeffall,"pair:beta");
 
   // Loop over nelements blocks in the SNAP coefficient file
 
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index 1fa065755c..33d1fb8bc9 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -55,6 +55,8 @@ protected:
   void set_sna_to_shared(int snaid,int i);
   void build_per_atom_arrays();
 
+  void compute_betai(int);
+
   int schedule_user;
   double schedule_time_guided;
   double schedule_time_dynamic;
@@ -99,6 +101,7 @@ protected:
   double *radelem;              // element radii
   double *wjelem;               // elements weights
   double **coeffelem;           // element bispectrum coefficients
+  double* beta;                 // beta for current atom
   int *map;                     // mapping from atom types to elements
   int twojmax, diagonalstyle, switchflag, bzeroflag;
   double rfac0, rmin0, wj1, wj2;
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index d30d94dc9d..b729e4d0d6 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -1653,10 +1653,10 @@ void SNA::create_twojmax_arrays()
                    "sna:uarraytot");
     memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
                    "sna:zarray");
-  memory->create(yarray_r, jdim, jdim, jdim,
-                 "sna:yarray");
-  memory->create(yarray_i, jdim, jdim, jdim,
-                 "sna:yarray");
+    memory->create(yarray_r, jdim, jdim, jdim,
+                   "sna:yarray");
+    memory->create(yarray_i, jdim, jdim, jdim,
+                   "sna:yarray");
   }
 
 }

From a1f421cd5400de9cb0c486cd7b32e936f09c9eb0 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Sat, 11 May 2019 12:41:54 -0600
Subject: [PATCH 03/21] Moved compute_beta outside of main force loop

---
 src/SNAP/pair_snap.cpp | 37 +++++++++++++++++++++++++------------
 src/SNAP/pair_snap.h   |  5 +++--
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 268e5663da..0bf367b5dc 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -86,7 +86,7 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
   i_uarraytot_r = NULL;
   i_uarraytot_i = NULL;
   i_zarray_r = NULL;
-  i_zarray_i =NULL;
+  i_zarray_i = NULL;
 
   use_shared_arrays = 0;
 
@@ -101,6 +101,7 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
 
   sna = NULL;
 
+  beta_max = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -189,6 +190,15 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   int newton_pair = force->newton_pair;
   class SNA* snaptr = sna[0];
 
+  if (beta_max < list->inum) {
+    memory->grow(beta,list->inum,ncoeff,"PairSNAP:beta");
+    beta_max = list->inum;
+  }
+
+  // compute dE_i/dB_i = beta_i for all i in list
+
+  compute_beta();
+
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
@@ -250,13 +260,9 @@ void PairSNAP::compute_regular(int eflag, int vflag)
     // compute Fij = dEi/dRj = -dEi/dRi 
     // add to Fi, subtract from Fj
 
-    // compute dE_i/dB_i = beta_i
-
-    compute_betai(ielem);
-
     // compute beta_i*Z_i = Y_i
 
-    snaptr->compute_yi(beta);
+    snaptr->compute_yi(beta[ii]);
 
     for (int jj = 0; jj < ninside; jj++) {
       int j = snaptr->inside[jj];
@@ -1294,15 +1300,23 @@ void PairSNAP::build_per_atom_arrays()
 }
 
 /* ----------------------------------------------------------------------
-   compute beta_i 
+   compute beta
 ------------------------------------------------------------------------- */
 
-void PairSNAP::compute_betai(int ielem)
+void PairSNAP::compute_beta()
 {
-  double* coeffi = coeffelem[ielem];
+  int i;
+  int *type = atom->type;
 
-  for (int k = 1; k <= ncoeff; k++)
-    beta[k-1] = coeffi[k];
+  for (int ii = 0; ii < list->inum; ii++) {
+    i = list->ilist[ii];
+    const int itype = type[i];
+    const int ielem = map[itype];
+    double* coeffi = coeffelem[ielem];
+
+    for (int k = 1; k <= ncoeff; k++)
+      beta[ii][k-1] = coeffi[k];
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1631,7 +1645,6 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
   memory->create(radelem,nelements,"pair:radelem");
   memory->create(wjelem,nelements,"pair:wjelem");
   memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
-  memory->create(beta,ncoeffall,"pair:beta");
 
   // Loop over nelements blocks in the SNAP coefficient file
 
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index 33d1fb8bc9..94d21162e2 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -55,7 +55,7 @@ protected:
   void set_sna_to_shared(int snaid,int i);
   void build_per_atom_arrays();
 
-  void compute_betai(int);
+  void compute_beta();
 
   int schedule_user;
   double schedule_time_guided;
@@ -101,11 +101,12 @@ protected:
   double *radelem;              // element radii
   double *wjelem;               // elements weights
   double **coeffelem;           // element bispectrum coefficients
-  double* beta;                 // beta for current atom
+  double** beta;                // betas for all atoms in list
   int *map;                     // mapping from atom types to elements
   int twojmax, diagonalstyle, switchflag, bzeroflag;
   double rfac0, rmin0, wj1, wj2;
   int rcutfacflag, twojmaxflag; // flags for required parameters
+  int beta_max;                 // length of beta
 };
 
 }

From e13c661f774ea40f39b042f46d540c703fb88c5d Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Sat, 11 May 2019 12:54:18 -0600
Subject: [PATCH 04/21] Added placeholder for neural network SNAP potential

---
 src/SNAP/pair_nn_snap.cpp | 1824 +++++++++++++++++++++++++++++++++++++
 src/SNAP/pair_nn_snap.h   |  184 ++++
 2 files changed, 2008 insertions(+)
 create mode 100644 src/SNAP/pair_nn_snap.cpp
 create mode 100644 src/SNAP/pair_nn_snap.h

diff --git a/src/SNAP/pair_nn_snap.cpp b/src/SNAP/pair_nn_snap.cpp
new file mode 100644
index 0000000000..e90f6d6b1b
--- /dev/null
+++ b/src/SNAP/pair_nn_snap.cpp
@@ -0,0 +1,1824 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "pair_nn_snap.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "force.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "sna.h"
+#include "openmp_snap.h"
+#include "domain.h"
+#include "memory.h"
+#include "error.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+// Outstanding issues with quadratic term
+// 1. there seems to a problem with compute_optimized energy calc
+// it does not match compute_regular, even when quadratic coeffs = 0
+
+/* ---------------------------------------------------------------------- */
+
+PairNNSNAP::PairNNSNAP(LAMMPS *lmp) : Pair(lmp)
+{
+  single_enable = 0;
+  restartinfo = 0;
+  one_coeff = 1;
+  manybody_flag = 1;
+
+  nelements = 0;
+  elements = NULL;
+  radelem = NULL;
+  wjelem = NULL;
+  coeffelem = NULL;
+
+  nmax = 0;
+  nthreads = 1;
+
+  schedule_user = 0;
+  schedule_time_guided = -1;
+  schedule_time_dynamic = -1;
+  ncalls_neigh =-1;
+
+  ilistmask_max = 0;
+  ilistmask = NULL;
+  ghostinum = 0;
+  ghostilist_max = 0;
+  ghostilist = NULL;
+  ghostnumneigh_max = 0;
+  ghostnumneigh = NULL;
+  ghostneighs = NULL;
+  ghostfirstneigh = NULL;
+  ghostneighs_total = 0;
+  ghostneighs_max = 0;
+
+  i_max = 0;
+  i_neighmax = 0;
+  i_numpairs = 0;
+  i_rij = NULL;
+  i_inside = NULL;
+  i_wj = NULL;
+  i_rcutij = NULL;
+  i_ninside = NULL;
+  i_pairs = NULL;
+  i_uarraytot_r = NULL;
+  i_uarraytot_i = NULL;
+  i_zarray_r = NULL;
+  i_zarray_i = NULL;
+
+  use_shared_arrays = 0;
+
+#ifdef TIMING_INFO
+  timers[0] = 0;
+  timers[1] = 0;
+  timers[2] = 0;
+  timers[3] = 0;
+#endif
+
+  // Need to set this because restart not handled by PairHybrid
+
+  sna = NULL;
+
+  beta_max = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairNNSNAP::~PairNNSNAP()
+{
+  if (copymode) return;
+
+  if (nelements) {
+    for (int i = 0; i < nelements; i++)
+      delete[] elements[i];
+    delete[] elements;
+    memory->destroy(radelem);
+    memory->destroy(wjelem);
+    memory->destroy(coeffelem);
+    memory->destroy(beta);
+  }
+
+  // Need to set this because restart not handled by PairHybrid
+
+  if (sna) {
+
+#ifdef TIMING_INFO
+    double time[5];
+    double timeave[5];
+    double timeave_mpi[5];
+    double timemax_mpi[5];
+
+    for (int i = 0; i < 5; i++) {
+      time[i] = 0;
+      timeave[i] = 0;
+      for (int tid = 0; tid<nthreads; tid++) {
+        if (sna[tid]->timers[i]>time[i])
+          time[i] = sna[tid]->timers[i];
+        timeave[i] += sna[tid]->timers[i];
+      }
+      timeave[i] /= nthreads;
+    }
+    MPI_Reduce(timeave, timeave_mpi, 5, MPI_DOUBLE, MPI_SUM, 0, world);
+    MPI_Reduce(time, timemax_mpi, 5, MPI_DOUBLE, MPI_MAX, 0, world);
+#endif
+
+    for (int tid = 0; tid<nthreads; tid++)
+      delete sna[tid];
+    delete [] sna;
+
+  }
+
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+    memory->destroy(map);
+  }
+
+}
+
+void PairNNSNAP::compute(int eflag, int vflag)
+{
+//   if (use_optimized)
+//     compute_optimized(eflag, vflag);
+//   else
+
+// hard-code compute_regular()
+ 
+    compute_regular(eflag, vflag);
+}
+
+/* ----------------------------------------------------------------------
+   This version is a straightforward implementation
+   ---------------------------------------------------------------------- */
+
+void PairNNSNAP::compute_regular(int eflag, int vflag)
+{
+  int i,j,jnum,ninside;
+  double delx,dely,delz,evdwl,rsq;
+  double fij[3];
+  int *jlist,*numneigh,**firstneigh;
+  evdwl = 0.0;
+
+  ev_init(eflag,vflag);
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+  class SNA* snaptr = sna[0];
+
+  if (beta_max < list->inum) {
+    memory->grow(beta,list->inum,ncoeff,"PairNNSNAP:beta");
+    beta_max = list->inum;
+  }
+
+  // compute dE_i/dB_i = beta_i for all i in list
+
+  compute_beta();
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  for (int ii = 0; ii < list->inum; ii++) {
+    i = list->ilist[ii];
+
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    const int itype = type[i];
+    const int ielem = map[itype];
+    const double radi = radelem[ielem];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // insure rij, inside, wj, and rcutij are of size jnum
+
+    snaptr->grow_rij(jnum);
+
+    // rij[][3] = displacements between atom I and those neighbors
+    // inside = indices of neighbors of I within cutoff
+    // wj = weights for neighbors of I within cutoff
+    // rcutij = cutoffs for neighbors of I within cutoff
+    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+    ninside = 0;
+    for (int jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      delx = x[j][0] - xtmp;
+      dely = x[j][1] - ytmp;
+      delz = x[j][2] - ztmp;
+      rsq = delx*delx + dely*dely + delz*delz;
+      int jtype = type[j];
+      int jelem = map[jtype];
+
+      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
+        snaptr->rij[ninside][0] = delx;
+        snaptr->rij[ninside][1] = dely;
+        snaptr->rij[ninside][2] = delz;
+        snaptr->inside[ninside] = j;
+        snaptr->wj[ninside] = wjelem[jelem];
+        snaptr->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
+        ninside++;
+      }
+    }
+
+    // compute Ui, Zi, and Bi for atom I
+
+    snaptr->compute_ui(ninside);
+    snaptr->compute_zi();
+    if (quadraticflag) {
+      snaptr->compute_bi();
+      snaptr->copy_bi2bvec();
+    }
+
+    // for neighbors of I within cutoff:
+    // compute Fij = dEi/dRj = -dEi/dRi 
+    // add to Fi, subtract from Fj
+
+    // compute beta_i*Z_i = Y_i
+
+    snaptr->compute_yi(beta[ii]);
+
+    for (int jj = 0; jj < ninside; jj++) {
+      int j = snaptr->inside[jj];
+      snaptr->compute_duidrj(snaptr->rij[jj],
+                             snaptr->wj[jj],snaptr->rcutij[jj]);
+
+//       // quadratic contributions
+
+//       if (quadraticflag) {
+//         int k = ncoeff+1;
+//         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+//           double bveci = snaptr->bvec[icoeff];
+//           double fack = coeffi[k]*bveci;
+//           double* dbveci = snaptr->dbvec[icoeff];
+//           fij[0] += fack*dbveci[0];
+//           fij[1] += fack*dbveci[1];
+//           fij[2] += fack*dbveci[2];
+//           k++;
+//           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+//             double facki = coeffi[k]*bveci;
+//             double fackj = coeffi[k]*snaptr->bvec[jcoeff];
+//             double* dbvecj = snaptr->dbvec[jcoeff];
+
+//             fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
+//             fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
+//             fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
+//             k++;
+//           }
+//         }
+//       }
+
+      snaptr->compute_deidrj(fij);
+
+      f[i][0] += fij[0];
+      f[i][1] += fij[1];
+      f[i][2] += fij[2];
+      f[j][0] -= fij[0];
+      f[j][1] -= fij[1];
+      f[j][2] -= fij[2];
+
+      // tally per-atom virial contribution
+
+      if (vflag)
+        ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
+                     fij[0],fij[1],fij[2],
+                     -snaptr->rij[jj][0],-snaptr->rij[jj][1],
+                     -snaptr->rij[jj][2]);
+    }
+
+    // tally energy contribution
+
+    if (eflag) {
+
+      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
+
+      double* coeffi = coeffelem[ielem];
+      evdwl = coeffi[0];
+      if (!quadraticflag) {
+        snaptr->compute_bi();
+        snaptr->copy_bi2bvec();
+      }
+
+      // E = beta.B + 0.5*B^t.alpha.B
+      // coeff[k] = beta[k-1] or
+      // coeff[k] = alpha_ii or
+      // coeff[k] = alpha_ij = alpha_ji, j != i
+
+      // linear contributions
+
+      for (int k = 1; k <= ncoeff; k++)
+        evdwl += coeffi[k]*snaptr->bvec[k-1];
+
+      // quadratic contributions
+
+      if (quadraticflag) {
+        int k = ncoeff+1;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          double bveci = snaptr->bvec[icoeff];
+          evdwl += 0.5*coeffi[k++]*bveci*bveci;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            evdwl += coeffi[k++]*bveci*snaptr->bvec[jcoeff];
+          }
+        }
+      }
+      ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
+    }
+
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+
+/* ----------------------------------------------------------------------
+   This version is optimized for threading, micro-load balancing
+   ---------------------------------------------------------------------- */
+
+void PairNNSNAP::compute_optimized(int eflag, int vflag)
+{
+  // if reneighboring took place do load_balance if requested
+  if (do_load_balance > 0 &&
+      (neighbor->ncalls != ncalls_neigh)) {
+    ghostinum = 0;
+    // reset local ghost neighbor lists
+    ncalls_neigh = neighbor->ncalls;
+    if (ilistmask_max < list->inum) {
+      memory->grow(ilistmask,list->inum,"PairSnap::ilistmask");
+      ilistmask_max = list->inum;
+    }
+    for (int i = 0; i < list->inum; i++)
+      ilistmask[i] = 1;
+
+    //multiple passes for loadbalancing
+    for (int i = 0; i < do_load_balance; i++)
+      load_balance();
+  }
+
+  int numpairs = 0;
+  for (int ii = 0; ii < list->inum; ii++) {
+    if ((do_load_balance <= 0) || ilistmask[ii]) {
+      int i = list->ilist[ii];
+      int jnum = list->numneigh[i];
+      numpairs += jnum;
+    }
+  }
+
+  if (do_load_balance)
+    for (int ii = 0; ii < ghostinum; ii++) {
+      int i = ghostilist[ii];
+      int jnum = ghostnumneigh[i];
+      numpairs += jnum;
+    }
+
+  // optimized schedule setting
+
+  int time_dynamic = 0;
+  int time_guided = 0;
+
+  if (schedule_user == 0) schedule_user = 4;
+
+  switch (schedule_user) {
+  case 1:
+    omp_set_schedule(omp_sched_static,1);
+    break;
+  case 2:
+    omp_set_schedule(omp_sched_dynamic,1);
+    break;
+  case 3:
+    omp_set_schedule(omp_sched_guided,2);
+    break;
+  case 4:
+    omp_set_schedule(omp_sched_auto,0);
+    break;
+  case 5:
+    if (numpairs < 8*nthreads) omp_set_schedule(omp_sched_dynamic,1);
+    else if (schedule_time_guided < 0.0) {
+      omp_set_schedule(omp_sched_guided,2);
+      if (!eflag && !vflag) time_guided = 1;
+    } else if (schedule_time_dynamic<0.0) {
+      omp_set_schedule(omp_sched_dynamic,1);
+      if (!eflag && !vflag) time_dynamic = 1;
+    } else if (schedule_time_guided<schedule_time_dynamic)
+      omp_set_schedule(omp_sched_guided,2);
+    else
+      omp_set_schedule(omp_sched_dynamic,1);
+    break;
+  }
+
+  if (use_shared_arrays)
+    build_per_atom_arrays();
+
+#if defined(_OPENMP)
+#pragma omp parallel shared(eflag,vflag,time_dynamic,time_guided) firstprivate(numpairs) default(none)
+#endif
+  {
+    // begin of pragma omp parallel
+
+    int tid = omp_get_thread_num();
+    int** pairs_tid_unique = NULL;
+
+    int** pairs;
+    if (use_shared_arrays) pairs = i_pairs;
+    else {
+      memory->create(pairs_tid_unique,numpairs,4,"numpairs");
+      pairs = pairs_tid_unique;
+    }
+
+    if (!use_shared_arrays) {
+      numpairs = 0;
+      for (int ii = 0; ii < list->inum; ii++) {
+        if ((do_load_balance <= 0) || ilistmask[ii]) {
+          int i = list->ilist[ii];
+          int jnum = list->numneigh[i];
+          for (int jj = 0; jj<jnum; jj++) {
+            pairs[numpairs][0] = i;
+            pairs[numpairs][1] = jj;
+            pairs[numpairs][2] = -1;
+            numpairs++;
+          }
+        }
+      }
+
+      for (int ii = 0; ii < ghostinum; ii++) {
+        int i = ghostilist[ii];
+        int jnum = ghostnumneigh[i];
+        for (int jj = 0; jj<jnum; jj++) {
+          pairs[numpairs][0] = i;
+          pairs[numpairs][1] = jj;
+          pairs[numpairs][2] = -1;
+          numpairs++;
+        }
+      }
+    }
+
+    int ielem;
+    int jj,k,jnum,jtype,ninside;
+    double delx,dely,delz,evdwl,rsq;
+    double fij[3];
+    int *jlist,*numneigh,**firstneigh;
+    evdwl = 0.0;
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    {
+      ev_init(eflag,vflag);
+    }
+
+#if defined(_OPENMP)
+#pragma omp barrier
+    { ; }
+#endif
+
+    double **x = atom->x;
+    double **f = atom->f;
+    int *type = atom->type;
+    int nlocal = atom->nlocal;
+    int newton_pair = force->newton_pair;
+
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+#ifdef TIMING_INFO
+    // only update micro timers after setup
+    static int count=0;
+    if (count<2) {
+      sna[tid]->timers[0] = 0;
+      sna[tid]->timers[1] = 0;
+      sna[tid]->timers[2] = 0;
+      sna[tid]->timers[3] = 0;
+      sna[tid]->timers[4] = 0;
+    }
+    count++;
+#endif
+
+    // did thread start working on interactions of new atom
+    int iold = -1;
+
+    double starttime, endtime;
+    if (time_dynamic || time_guided)
+      starttime = MPI_Wtime();
+
+#if defined(_OPENMP)
+#pragma omp for schedule(runtime)
+#endif
+    for (int iijj = 0; iijj < numpairs; iijj++) {
+      int i = 0;
+      if (use_shared_arrays) {
+        i = i_pairs[iijj][0];
+        if (iold != i) {
+          set_sna_to_shared(tid,i_pairs[iijj][3]);
+          ielem = map[type[i]];
+        }
+        iold = i;
+      } else {
+        i = pairs[iijj][0];
+        if (iold != i) {
+          iold = i;
+          const double xtmp = x[i][0];
+          const double ytmp = x[i][1];
+          const double ztmp = x[i][2];
+          const int itype = type[i];
+          ielem = map[itype];
+          const double radi = radelem[ielem];
+
+          if (i < nlocal) {
+            jlist = firstneigh[i];
+            jnum = numneigh[i];
+          } else {
+            jlist = ghostneighs+ghostfirstneigh[i];
+            jnum = ghostnumneigh[i];
+          }
+
+          // insure rij, inside, wj, and rcutij are of size jnum
+
+          sna[tid]->grow_rij(jnum);
+
+          // rij[][3] = displacements between atom I and those neighbors
+          // inside = indices of neighbors of I within cutoff
+          // wj = weights of neighbors of I within cutoff
+          // rcutij = cutoffs of neighbors of I within cutoff
+          // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+          ninside = 0;
+          for (jj = 0; jj < jnum; jj++) {
+            int j = jlist[jj];
+            j &= NEIGHMASK;
+            delx = x[j][0] - xtmp; //unitialised
+            dely = x[j][1] - ytmp;
+            delz = x[j][2] - ztmp;
+            rsq = delx*delx + dely*dely + delz*delz;
+            jtype = type[j];
+            int jelem = map[jtype];
+
+            if (rsq < cutsq[itype][jtype]&&rsq>1e-20) { //unitialised
+              sna[tid]->rij[ninside][0] = delx;
+              sna[tid]->rij[ninside][1] = dely;
+              sna[tid]->rij[ninside][2] = delz;
+              sna[tid]->inside[ninside] = j;
+              sna[tid]->wj[ninside] = wjelem[jelem];
+              sna[tid]->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
+              ninside++;
+
+              // update index list with inside index
+              pairs[iijj + (jj - pairs[iijj][1])][2] =
+                ninside-1; //unitialised
+            }
+          }
+
+          // compute Ui and Zi for atom I
+
+          sna[tid]->compute_ui(ninside); //unitialised
+          sna[tid]->compute_zi();
+        }
+      }
+      if (quadraticflag) {
+        sna[tid]->compute_bi();
+        sna[tid]->copy_bi2bvec();
+      }
+
+      // for neighbors of I within cutoff:
+      // compute dUi/drj and dBi/drj
+      // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
+
+      // entry into loop if inside index is set
+
+      double* coeffi = coeffelem[ielem];
+
+      if (pairs[iijj][2] >= 0) {
+        jj = pairs[iijj][2];
+        int j = sna[tid]->inside[jj];
+        sna[tid]->compute_duidrj(sna[tid]->rij[jj],
+                                 sna[tid]->wj[jj],sna[tid]->rcutij[jj]);
+
+        sna[tid]->compute_dbidrj();
+        sna[tid]->copy_dbi2dbvec();
+
+        fij[0] = 0.0;
+        fij[1] = 0.0;
+        fij[2] = 0.0;
+
+        // linear contributions
+
+        for (k = 1; k <= ncoeff; k++) {
+          double bgb = coeffi[k];
+          fij[0] += bgb*sna[tid]->dbvec[k-1][0];
+          fij[1] += bgb*sna[tid]->dbvec[k-1][1];
+          fij[2] += bgb*sna[tid]->dbvec[k-1][2];
+        }
+
+      // quadratic contributions
+
+      if (quadraticflag) {
+        int k = ncoeff+1;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          double bveci = sna[tid]->bvec[icoeff];
+          double fack = coeffi[k]*bveci;
+          double* dbveci = sna[tid]->dbvec[icoeff];
+          fij[0] += fack*sna[tid]->dbvec[icoeff][0];
+          fij[1] += fack*sna[tid]->dbvec[icoeff][1];
+          fij[2] += fack*sna[tid]->dbvec[icoeff][2];
+          k++;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            double facki = coeffi[k]*bveci;
+            double fackj = coeffi[k]*sna[tid]->bvec[jcoeff];
+            double* dbvecj = sna[tid]->dbvec[jcoeff];
+            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
+            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
+            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
+            k++;
+          }
+        }
+      }
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+        {
+          f[i][0] += fij[0];
+          f[i][1] += fij[1];
+          f[i][2] += fij[2];
+          f[j][0] -= fij[0];
+          f[j][1] -= fij[1];
+          f[j][2] -= fij[2];
+
+          // tally per-atom virial contribution
+
+          if (vflag)
+            ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
+                         fij[0],fij[1],fij[2],
+                         -sna[tid]->rij[jj][0],-sna[tid]->rij[jj][1],
+                         -sna[tid]->rij[jj][2]);
+        }
+      }
+
+      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
+      // only call this for first pair of each atom i
+      // if atom has no pairs, eatom=0, which is wrong
+
+      if (eflag&&pairs[iijj][1] == 0) {
+        evdwl = coeffi[0];
+
+        if (!quadraticflag) {
+          sna[tid]->compute_bi();
+          sna[tid]->copy_bi2bvec();
+        }
+
+        // E = beta.B + 0.5*B^t.alpha.B
+        // coeff[k] = beta[k-1] or
+        // coeff[k] = alpha_ii or
+        // coeff[k] = alpha_ij = alpha_ji, j != i
+
+        // linear contributions
+
+        for (int k = 1; k <= ncoeff; k++)
+          evdwl += coeffi[k]*sna[tid]->bvec[k-1];
+
+        // quadratic contributions
+
+        if (quadraticflag) {
+          int k = ncoeff+1;
+          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+            double bveci = sna[tid]->bvec[icoeff];
+            evdwl += 0.5*coeffi[k++]*bveci*bveci;
+            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+              evdwl += coeffi[k++]*bveci*sna[tid]->bvec[jcoeff];
+            }
+          }
+        }
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+        ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
+      }
+
+    }
+    if (time_dynamic || time_guided)
+      endtime = MPI_Wtime();
+    if (time_dynamic) schedule_time_dynamic = endtime - starttime;
+    if (time_guided) schedule_time_guided = endtime - starttime;
+    if (!use_shared_arrays) memory->destroy(pairs);
+
+  }// end of pragma omp parallel
+
+  if (vflag_fdotr) virial_fdotr_compute();
+
+}
+
+inline int PairNNSNAP::equal(double* x,double* y)
+{
+  double dist2 =
+    (x[0]-y[0])*(x[0]-y[0]) +
+    (x[1]-y[1])*(x[1]-y[1]) +
+    (x[2]-y[2])*(x[2]-y[2]);
+  if (dist2 < 1e-20) return 1;
+  return 0;
+}
+
+inline double PairNNSNAP::dist2(double* x,double* y)
+{
+  return
+    (x[0]-y[0])*(x[0]-y[0]) +
+    (x[1]-y[1])*(x[1]-y[1]) +
+    (x[2]-y[2])*(x[2]-y[2]);
+}
+
+// return extra communication cutoff
+// extra_cutoff = max(subdomain_length)
+
+double PairNNSNAP::extra_cutoff()
+{
+  double sublo[3],subhi[3];
+
+  if (domain->triclinic == 0) {
+    for (int dim = 0 ; dim < 3 ; dim++) {
+      sublo[dim] = domain->sublo[dim];
+      subhi[dim] = domain->subhi[dim];
+    }
+  } else {
+    domain->lamda2x(domain->sublo_lamda,sublo);
+    domain->lamda2x(domain->subhi_lamda,subhi);
+  }
+
+  double sub_size[3];
+  for (int dim = 0; dim < 3; dim++)
+    sub_size[dim] = subhi[dim] - sublo[dim];
+
+  double max_sub_size = 0;
+  for (int dim = 0; dim < 3; dim++)
+    max_sub_size = MAX(max_sub_size,sub_size[dim]);
+
+  // note: for triclinic, probably need something different
+  // see Comm::setup()
+
+  return max_sub_size;
+}
+
+// micro load_balancer: each MPI process will
+// check with each of its 26 neighbors,
+// whether an imbalance exists in the number
+// of atoms to calculate forces for.
+// If it does it will set ilistmask of one of
+// its local atoms to zero, and send its Tag
+// to the neighbor process. The neighboring process
+// will check its ghost list for the
+// ghost atom with the same Tag which is closest
+// to its domain center, and build a
+// neighborlist for this ghost atom. For this to work,
+// the communication cutoff has to be
+// as large as the neighbor cutoff +
+// maximum subdomain length.
+
+// Note that at most one atom is exchanged per processor pair.
+
+// Also note that the local atom assignment
+// doesn't change. This load balancer will cause
+// some ghost atoms to have full neighborlists
+// which are unique to PairNNSNAP.
+// They are not part of the generally accessible neighborlist.
+// At the same time corresponding local atoms on
+// other MPI processes will not be
+// included in the force computation since
+// their ilistmask is 0. This does not effect
+// any other classes which might
+// access the same general neighborlist.
+// Reverse communication (newton on) of forces is required.
+
+// Currently the load balancer does two passes,
+// since its exchanging atoms upstream and downstream.
+
+void PairNNSNAP::load_balance()
+{
+  double sublo[3],subhi[3];
+  if (domain->triclinic == 0) {
+    double* sublotmp = domain->sublo;
+    double* subhitmp = domain->subhi;
+    for (int dim = 0 ; dim<3 ; dim++) {
+      sublo[dim]=sublotmp[dim];
+      subhi[dim]=subhitmp[dim];
+    }
+  } else {
+    double* sublotmp = domain->sublo_lamda;
+    double* subhitmp = domain->subhi_lamda;
+    domain->lamda2x(sublotmp,sublo);
+    domain->lamda2x(subhitmp,subhi);
+  }
+
+  //if (list->inum==0) list->grow(atom->nmax);
+
+  int nlocal = ghostinum;
+  for (int i=0; i < list->inum; i++)
+    if (ilistmask[i]) nlocal++;
+  int ***grid2proc = comm->grid2proc;
+  int* procgrid = comm->procgrid;
+
+  int nlocal_up,nlocal_down;
+  MPI_Request request;
+
+  double sub_mid[3];
+  for (int dim=0; dim<3; dim++)
+    sub_mid[dim] = (subhi[dim] + sublo[dim])/2;
+
+  if (comm->cutghostuser <
+      neighbor->cutneighmax+extra_cutoff())
+    error->all(FLERR,"Communication cutoff too small for SNAP micro load balancing");
+
+  int nrecv = ghostinum;
+  int totalsend = 0;
+  int nsend = 0;
+  int depth = 1;
+
+  for (int dx = -depth; dx < depth+1; dx++)
+    for (int dy = -depth; dy < depth+1; dy++)
+      for (int dz = -depth; dz < depth+1; dz++) {
+
+        if (dx == dy && dy == dz && dz == 0) continue;
+
+        int sendloc[3] = {comm->myloc[0],
+                          comm->myloc[1], comm->myloc[2]
+                         };
+        sendloc[0] += dx;
+        sendloc[1] += dy;
+        sendloc[2] += dz;
+        for (int dim = 0; dim < 3; dim++)
+          if (sendloc[dim] >= procgrid[dim])
+            sendloc[dim] = sendloc[dim] - procgrid[dim];
+        for (int dim = 0; dim < 3; dim++)
+          if (sendloc[dim] < 0)
+            sendloc[dim] = procgrid[dim] + sendloc[dim];
+        int recvloc[3] = {comm->myloc[0],
+                          comm->myloc[1], comm->myloc[2]
+                         };
+        recvloc[0] -= dx;
+        recvloc[1] -= dy;
+        recvloc[2] -= dz;
+        for (int dim = 0; dim < 3; dim++)
+          if (recvloc[dim] < 0)
+            recvloc[dim] = procgrid[dim] + recvloc[dim];
+        for (int dim = 0; dim < 3; dim++)
+          if (recvloc[dim] >= procgrid[dim])
+            recvloc[dim] = recvloc[dim] - procgrid[dim];
+
+        int sendproc = grid2proc[sendloc[0]][sendloc[1]][sendloc[2]];
+        int recvproc = grid2proc[recvloc[0]][recvloc[1]][recvloc[2]];
+
+        // two stage process, first upstream movement, then downstream
+
+        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
+                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
+        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
+                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
+        nsend = 0;
+
+        // send upstream
+
+        if (nlocal > nlocal_up+1) {
+
+          int i = totalsend++;
+          while(i < list->inum && ilistmask[i] == 0)
+            i = totalsend++;
+
+          if (i < list->inum)
+            MPI_Isend(&atom->tag[i],1,MPI_INT,recvproc,0,world,&request);
+          else {
+            int j = -1;
+            MPI_Isend(&j,1,MPI_INT,recvproc,0,world,&request);
+          }
+
+          if (i < list->inum) {
+            for (int j = 0; j < list->inum; j++)
+              if (list->ilist[j] == i)
+                ilistmask[j] = 0;
+            nsend = 1;
+          }
+        }
+
+        // recv downstream
+
+        if (nlocal < nlocal_down-1) {
+          nlocal++;
+          int get_tag = -1;
+          MPI_Recv(&get_tag,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
+
+          // if get_tag -1 the other process didnt have local atoms to send
+
+          if (get_tag >= 0) {
+            if (ghostinum >= ghostilist_max) {
+              memory->grow(ghostilist,ghostinum+10,
+                           "PairSnap::ghostilist");
+              ghostilist_max = ghostinum+10;
+            }
+            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
+              ghostnumneigh_max = atom->nlocal+atom->nghost+100;
+              memory->grow(ghostnumneigh,ghostnumneigh_max,
+                           "PairSnap::ghostnumneigh");
+              memory->grow(ghostfirstneigh,ghostnumneigh_max,
+                           "PairSnap::ghostfirstneigh");
+            }
+
+            // find closest ghost image of the transfered particle
+
+            double mindist = 1e200;
+            int closestghost = -1;
+            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
+              if (atom->tag[j] == get_tag)
+                if (dist2(sub_mid, atom->x[j]) < mindist) {
+                  closestghost = j;
+                  mindist = dist2(sub_mid, atom->x[j]);
+                }
+
+            // build neighborlist for this particular
+            // ghost atom, and add it to list->ilist
+
+            if (ghostneighs_max - ghostneighs_total <
+                neighbor->oneatom) {
+              memory->grow(ghostneighs,
+                           ghostneighs_total + neighbor->oneatom,
+                           "PairSnap::ghostneighs");
+              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
+            }
+
+            int j = closestghost;
+
+            ghostilist[ghostinum] = j;
+            ghostnumneigh[j] = 0;
+            ghostfirstneigh[j] = ghostneighs_total;
+
+            ghostinum++;
+            int* jlist = ghostneighs + ghostfirstneigh[j];
+
+            // find all neighbors by looping
+            // over all local and ghost atoms
+
+            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
+              if (dist2(atom->x[j],atom->x[k]) <
+                  neighbor->cutneighmax*neighbor->cutneighmax) {
+                jlist[ghostnumneigh[j]] = k;
+                ghostnumneigh[j]++;
+                ghostneighs_total++;
+              }
+          }
+
+          if (get_tag >= 0) nrecv++;
+        }
+
+        // decrease nlocal later, so that it is the
+        // initial number both for receiving and sending
+
+        if (nsend) nlocal--;
+
+        // second pass through the grid
+
+        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
+                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
+        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
+                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
+
+        // send downstream
+
+        nsend=0;
+        if (nlocal > nlocal_down+1) {
+          int i = totalsend++;
+          while(i < list->inum && ilistmask[i]==0) i = totalsend++;
+
+          if (i < list->inum)
+            MPI_Isend(&atom->tag[i],1,MPI_INT,sendproc,0,world,&request);
+          else {
+            int j =- 1;
+            MPI_Isend(&j,1,MPI_INT,sendproc,0,world,&request);
+          }
+
+          if (i < list->inum) {
+            for (int j=0; j<list->inum; j++)
+              if (list->ilist[j] == i) ilistmask[j] = 0;
+            nsend = 1;
+          }
+        }
+
+        // receive upstream
+
+        if (nlocal < nlocal_up-1) {
+          nlocal++;
+          int get_tag = -1;
+
+          MPI_Recv(&get_tag,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
+
+          if (get_tag >= 0) {
+            if (ghostinum >= ghostilist_max) {
+              memory->grow(ghostilist,ghostinum+10,
+                           "PairSnap::ghostilist");
+              ghostilist_max = ghostinum+10;
+            }
+            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
+              ghostnumneigh_max = atom->nlocal + atom->nghost + 100;
+              memory->grow(ghostnumneigh,ghostnumneigh_max,
+                           "PairSnap::ghostnumneigh");
+              memory->grow(ghostfirstneigh,ghostnumneigh_max,
+                           "PairSnap::ghostfirstneigh");
+            }
+
+            // find closest ghost image of the transfered particle
+
+            double mindist = 1e200;
+            int closestghost = -1;
+            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
+              if (atom->tag[j] == get_tag)
+                if (dist2(sub_mid,atom->x[j])<mindist) {
+                  closestghost = j;
+                  mindist = dist2(sub_mid,atom->x[j]);
+                }
+
+            // build neighborlist for this particular ghost atom
+
+            if (ghostneighs_max-ghostneighs_total < neighbor->oneatom) {
+              memory->grow(ghostneighs,ghostneighs_total + neighbor->oneatom,
+                           "PairSnap::ghostneighs");
+              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
+            }
+
+            int j = closestghost;
+
+            ghostilist[ghostinum] = j;
+            ghostnumneigh[j] = 0;
+            ghostfirstneigh[j] = ghostneighs_total;
+
+            ghostinum++;
+            int* jlist = ghostneighs + ghostfirstneigh[j];
+
+            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
+              if (dist2(atom->x[j],atom->x[k]) <
+                  neighbor->cutneighmax*neighbor->cutneighmax) {
+                jlist[ghostnumneigh[j]] = k;
+                ghostnumneigh[j]++;
+                ghostneighs_total++;
+              }
+          }
+
+          if (get_tag >= 0) nrecv++;
+        }
+        if (nsend) nlocal--;
+      }
+}
+
+void PairNNSNAP::set_sna_to_shared(int snaid,int i)
+{
+  sna[snaid]->rij = i_rij[i];
+  sna[snaid]->inside = i_inside[i];
+  sna[snaid]->wj = i_wj[i];
+  sna[snaid]->rcutij = i_rcutij[i];
+  sna[snaid]->zarray_r = i_zarray_r[i];
+  sna[snaid]->zarray_i = i_zarray_i[i];
+  sna[snaid]->uarraytot_r = i_uarraytot_r[i];
+  sna[snaid]->uarraytot_i = i_uarraytot_i[i];
+}
+
+void PairNNSNAP::build_per_atom_arrays()
+{
+
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&starttime);
+#endif
+
+  int count = 0;
+  int neighmax = 0;
+  for (int ii = 0; ii < list->inum; ii++)
+    if ((do_load_balance <= 0) || ilistmask[ii]) {
+      neighmax=MAX(neighmax,list->numneigh[list->ilist[ii]]);
+      ++count;
+    }
+  for (int ii = 0; ii < ghostinum; ii++) {
+    neighmax=MAX(neighmax,ghostnumneigh[ghostilist[ii]]);
+    ++count;
+  }
+
+  if (i_max < count || i_neighmax < neighmax) {
+    int i_maxt = MAX(count,i_max);
+    i_neighmax = MAX(neighmax,i_neighmax);
+    memory->destroy(i_rij);
+    memory->destroy(i_inside);
+    memory->destroy(i_wj);
+    memory->destroy(i_rcutij);
+    memory->destroy(i_ninside);
+    memory->destroy(i_pairs);
+    memory->create(i_rij,i_maxt,i_neighmax,3,"PairNNSNAP::i_rij");
+    memory->create(i_inside,i_maxt,i_neighmax,"PairNNSNAP::i_inside");
+    memory->create(i_wj,i_maxt,i_neighmax,"PairNNSNAP::i_wj");
+    memory->create(i_rcutij,i_maxt,i_neighmax,"PairNNSNAP::i_rcutij");
+    memory->create(i_ninside,i_maxt,"PairNNSNAP::i_ninside");
+    memory->create(i_pairs,i_maxt*i_neighmax,4,"PairNNSNAP::i_pairs");
+  }
+
+  if (i_max < count) {
+    int jdim = sna[0]->twojmax+1;
+    memory->destroy(i_uarraytot_r);
+    memory->destroy(i_uarraytot_i);
+    memory->create(i_uarraytot_r,count,jdim,jdim,jdim,
+                   "PairNNSNAP::i_uarraytot_r");
+    memory->create(i_uarraytot_i,count,jdim,jdim,jdim,
+                   "PairNNSNAP::i_uarraytot_i");
+    if (i_zarray_r != NULL)
+      for (int i = 0; i < i_max; i++) {
+        memory->destroy(i_zarray_r[i]);
+        memory->destroy(i_zarray_i[i]);
+      }
+
+    delete [] i_zarray_r;
+    delete [] i_zarray_i;
+    i_zarray_r = new double*****[count];
+    i_zarray_i = new double*****[count];
+    for (int i = 0; i < count; i++) {
+      memory->create(i_zarray_r[i],jdim,jdim,jdim,jdim,jdim,
+                     "PairNNSNAP::i_zarray_r");
+      memory->create(i_zarray_i[i],jdim,jdim,jdim,jdim,jdim,
+                     "PairNNSNAP::i_zarray_i");
+    }
+  }
+
+  if (i_max < count)
+    i_max = count;
+
+  count = 0;
+  i_numpairs = 0;
+  for (int ii = 0; ii < list->inum; ii++) {
+    if ((do_load_balance <= 0) || ilistmask[ii]) {
+      int i = list->ilist[ii];
+      int jnum = list->numneigh[i];
+      int* jlist = list->firstneigh[i];
+      const double xtmp = atom->x[i][0];
+      const double ytmp = atom->x[i][1];
+      const double ztmp = atom->x[i][2];
+      const int itype = atom->type[i];
+      const int ielem = map[itype];
+      const double radi = radelem[ielem];
+      int ninside = 0;
+      for (int jj = 0; jj < jnum; jj++) {
+        int j = jlist[jj];
+        j &= NEIGHMASK;
+        const double delx = atom->x[j][0] - xtmp;
+        const double dely = atom->x[j][1] - ytmp;
+        const double delz = atom->x[j][2] - ztmp;
+        const double rsq = delx*delx + dely*dely + delz*delz;
+        int jtype = atom->type[j];
+        int jelem = map[jtype];
+
+        i_pairs[i_numpairs][0] = i;
+        i_pairs[i_numpairs][1] = jj;
+        i_pairs[i_numpairs][2] = -1;
+        i_pairs[i_numpairs][3] = count;
+        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
+          i_rij[count][ninside][0] = delx;
+          i_rij[count][ninside][1] = dely;
+          i_rij[count][ninside][2] = delz;
+          i_inside[count][ninside] = j;
+          i_wj[count][ninside] = wjelem[jelem];
+          i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
+
+          // update index list with inside index
+          i_pairs[i_numpairs][2] = ninside++;
+        }
+        i_numpairs++;
+      }
+      i_ninside[count] = ninside;
+      count++;
+    }
+  }
+
+  for (int ii = 0; ii < ghostinum; ii++) {
+    int i = ghostilist[ii];
+    int jnum = ghostnumneigh[i];
+    int* jlist = ghostneighs+ghostfirstneigh[i];
+    const double xtmp = atom->x[i][0];
+    const double ytmp = atom->x[i][1];
+    const double ztmp = atom->x[i][2];
+    const int itype = atom->type[i];
+    const int ielem = map[itype];
+    const double radi = radelem[ielem];
+    int ninside = 0;
+
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = jlist[jj];
+      j &= NEIGHMASK;
+      const double delx = atom->x[j][0] - xtmp;
+      const double dely = atom->x[j][1] - ytmp;
+      const double delz = atom->x[j][2] - ztmp;
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      int jtype = atom->type[j];
+      int jelem = map[jtype];
+
+      i_pairs[i_numpairs][0] = i;
+      i_pairs[i_numpairs][1] = jj;
+      i_pairs[i_numpairs][2] = -1;
+      i_pairs[i_numpairs][3] = count;
+      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
+        i_rij[count][ninside][0] = delx;
+        i_rij[count][ninside][1] = dely;
+        i_rij[count][ninside][2] = delz;
+        i_inside[count][ninside] = j;
+        i_wj[count][ninside] = wjelem[jelem];
+        i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
+        // update index list with inside index
+        i_pairs[i_numpairs][2] = ninside++;
+      }
+      i_numpairs++;
+    }
+    i_ninside[count] = ninside;
+    count++;
+  }
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
+              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+#endif
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&starttime);
+#endif
+
+#if defined(_OPENMP)
+#pragma omp parallel for shared(count) default(none)
+#endif
+  for (int ii=0; ii < count; ii++) {
+    int tid = omp_get_thread_num();
+    set_sna_to_shared(tid,ii);
+    //sna[tid]->compute_ui(i_ninside[ii]);
+#ifdef TIMING_INFO
+    clock_gettime(CLOCK_REALTIME,&starttime);
+#endif
+    sna[tid]->compute_ui_omp(i_ninside[ii],MAX(int(nthreads/count),1));
+#ifdef TIMING_INFO
+    clock_gettime(CLOCK_REALTIME,&endtime);
+    sna[tid]->timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
+                          (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+#endif
+  }
+
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&starttime);
+#endif
+  for (int ii=0; ii < count; ii++) {
+    int tid = 0;//omp_get_thread_num();
+    set_sna_to_shared(tid,ii);
+    sna[tid]->compute_zi_omp(MAX(int(nthreads/count),1));
+  }
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  sna[0]->timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
+                      (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+#endif
+
+#ifdef TIMING_INFO
+  clock_gettime(CLOCK_REALTIME,&endtime);
+  timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
+              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   compute beta
+------------------------------------------------------------------------- */
+
+void PairNNSNAP::compute_beta()
+{
+  int i;
+  int *type = atom->type;
+
+  for (int ii = 0; ii < list->inum; ii++) {
+    i = list->ilist[ii];
+    const int itype = type[i];
+    const int ielem = map[itype];
+    double* coeffi = coeffelem[ielem];
+
+    for (int k = 1; k <= ncoeff; k++)
+      beta[ii][k-1] = coeffi[k];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairNNSNAP::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+  memory->create(map,n+1,"pair:map");
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairNNSNAP::settings(int narg, char **arg)
+{
+
+  // set default values for optional arguments
+
+  nthreads = -1;
+  use_shared_arrays=-1;
+  do_load_balance = 0;
+  use_optimized = 1;
+
+  // optional arguments
+
+  for (int i=0; i < narg; i++) {
+    if (i+2>narg) error->all(FLERR,"Illegal pair_style command");
+    if (strcmp(arg[i],"nthreads")==0) {
+      nthreads=force->inumeric(FLERR,arg[++i]);
+#if defined(LMP_USER_OMP)
+      error->all(FLERR,"Must set number of threads via package omp command");
+#else
+      omp_set_num_threads(nthreads);
+      comm->nthreads=nthreads;
+#endif
+      continue;
+    }
+    if (strcmp(arg[i],"optimized")==0) {
+      use_optimized=force->inumeric(FLERR,arg[++i]);
+      continue;
+    }
+    if (strcmp(arg[i],"shared")==0) {
+      use_shared_arrays=force->inumeric(FLERR,arg[++i]);
+      continue;
+    }
+    if (strcmp(arg[i],"loadbalance")==0) {
+      do_load_balance = force->inumeric(FLERR,arg[++i]);
+      if (do_load_balance) {
+        double mincutoff = extra_cutoff() +
+          rcutmax + neighbor->skin;
+        if (comm->cutghostuser < mincutoff) {
+          char buffer[255];
+
+          //apparently mincutoff is 0 after sprintf command ?????
+
+          double tmp = mincutoff + 0.1;
+          sprintf(buffer, "Communication cutoff is too small "
+                  "for SNAP micro load balancing, increased to %lf",
+                  mincutoff+0.1);
+          if (comm->me==0)
+            error->warning(FLERR,buffer);
+
+          comm->cutghostuser = tmp;
+
+        }
+      }
+      continue;
+    }
+    if (strcmp(arg[i],"schedule")==0) {
+      i++;
+      if (strcmp(arg[i],"static")==0)
+        schedule_user = 1;
+      if (strcmp(arg[i],"dynamic")==0)
+        schedule_user = 2;
+      if (strcmp(arg[i],"guided")==0)
+        schedule_user = 3;
+      if (strcmp(arg[i],"auto")==0)
+        schedule_user = 4;
+      if (strcmp(arg[i],"determine")==0)
+        schedule_user = 5;
+      if (schedule_user == 0)
+        error->all(FLERR,"Illegal pair_style command");
+      continue;
+    }
+    error->all(FLERR,"Illegal pair_style command");
+  }
+
+  if (nthreads < 0)
+    nthreads = comm->nthreads;
+
+  if (use_shared_arrays < 0) {
+    if (nthreads > 1 && atom->nlocal <= 2*nthreads)
+      use_shared_arrays = 1;
+    else use_shared_arrays = 0;
+  }
+
+  // check if running non-optimized code with
+  // optimization flags set
+
+  if (!use_optimized)
+    if (nthreads > 1 ||
+        use_shared_arrays ||
+        do_load_balance ||
+        schedule_user)
+      error->all(FLERR,"Illegal pair_style command");
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairNNSNAP::coeff(int narg, char **arg)
+{
+  if (narg < 5) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  if (nelements) {
+    for (int i = 0; i < nelements; i++)
+      delete[] elements[i];
+    delete[] elements;
+    memory->destroy(radelem);
+    memory->destroy(wjelem);
+    memory->destroy(coeffelem);
+    memory->destroy(beta);
+  }
+
+  char* type1 = arg[0];
+  char* type2 = arg[1];
+  char* coefffilename = arg[2];
+  char* paramfilename = arg[3];
+  char** elemtypes = &arg[4];
+
+  // insure I,J args are * *
+
+  if (strcmp(type1,"*") != 0 || strcmp(type2,"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // read snapcoeff and snapparam files
+
+  read_files(coefffilename,paramfilename);
+
+  if (!quadraticflag)
+    ncoeff = ncoeffall - 1;
+  else {
+
+    // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2
+    // so, ncoeff = floor(sqrt(2*ncoeffall))-1
+
+    ncoeff = sqrt(2*ncoeffall)-1;
+    ncoeffq = (ncoeff*(ncoeff+1))/2;
+    int ntmp = 1+ncoeff+ncoeffq;
+    if (ntmp != ncoeffall) {
+      printf("ncoeffall = %d ntmp = %d ncoeff = %d \n",ncoeffall,ntmp,ncoeff);
+      error->all(FLERR,"Incorrect SNAP coeff file");
+    }
+  }
+
+  // read args that map atom types to SNAP elements
+  // map[i] = which element the Ith atom type is, -1 if not mapped
+  // map[0] is not used
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    char* elemname = elemtypes[i-1];
+    int jelem;
+    for (jelem = 0; jelem < nelements; jelem++)
+      if (strcmp(elemname,elements[jelem]) == 0)
+        break;
+
+    if (jelem < nelements)
+      map[i] = jelem;
+    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
+    else error->all(FLERR,"Incorrect args for pair coefficients");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+
+  int count = 0;
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        count++;
+      }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+
+  sna = new SNA*[nthreads];
+
+  // allocate memory for per OpenMP thread data which
+  // is wrapped into the sna class
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int tid = omp_get_thread_num();
+    sna[tid] = new SNA(lmp,rfac0,twojmax,
+                       diagonalstyle,use_shared_arrays,
+                       rmin0,switchflag,bzeroflag);
+    if (!use_shared_arrays)
+      sna[tid]->grow_rij(nmax);
+  }
+
+  if (ncoeff != sna[0]->ncoeff) {
+    if (comm->me == 0)
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
+    error->all(FLERR,"Incorrect SNAP parameter file");
+  }
+
+  // Calculate maximum cutoff for all elements
+
+  rcutmax = 0.0;
+  for (int ielem = 0; ielem < nelements; ielem++)
+    rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax);
+
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairNNSNAP::init_style()
+{
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style SNAP requires newton pair on");
+
+  // need a full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int tid = omp_get_thread_num();
+    sna[tid]->init();
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairNNSNAP::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  return (radelem[map[i]] +
+          radelem[map[j]])*rcutfac;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairNNSNAP::read_files(char *coefffilename, char *paramfilename)
+{
+
+  // open SNAP coefficient file on proc 0
+
+  FILE *fpcoeff;
+  if (comm->me == 0) {
+    fpcoeff = force->open_potential(coefffilename);
+    if (fpcoeff == NULL) {
+      char str[128];
+      snprintf(str,128,"Cannot open SNAP coefficient file %s",coefffilename);
+      error->one(FLERR,str);
+    }
+  }
+
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+
+  int n;
+  int nwords = 0;
+  while (nwords == 0) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fpcoeff);
+      } else n = strlen(line) + 1;
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    if ((ptr = strchr(line,'#'))) *ptr = '\0';
+    nwords = atom->count_words(line);
+  }
+  if (nwords != 2)
+    error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+  // words = ptrs to all words in line
+  // strip single and double quotes from words
+
+  char* words[MAXWORD];
+  int iword = 0;
+  words[iword] = strtok(line,"' \t\n\r\f");
+  iword = 1;
+  words[iword] = strtok(NULL,"' \t\n\r\f");
+
+  nelements = atoi(words[0]);
+  ncoeffall = atoi(words[1]);
+
+  // set up element lists
+
+  elements = new char*[nelements];
+  memory->create(radelem,nelements,"pair:radelem");
+  memory->create(wjelem,nelements,"pair:wjelem");
+  memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
+
+  // Loop over nelements blocks in the SNAP coefficient file
+
+  for (int ielem = 0; ielem < nelements; ielem++) {
+
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fpcoeff);
+      } else n = strlen(line) + 1;
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    nwords = atom->count_words(line);
+    if (nwords != 3)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+    iword = 0;
+    words[iword] = strtok(line,"' \t\n\r\f");
+    iword = 1;
+    words[iword] = strtok(NULL,"' \t\n\r\f");
+    iword = 2;
+    words[iword] = strtok(NULL,"' \t\n\r\f");
+
+    char* elemtmp = words[0];
+    int n = strlen(elemtmp) + 1;
+    elements[ielem] = new char[n];
+    strcpy(elements[ielem],elemtmp);
+
+    radelem[ielem] = atof(words[1]);
+    wjelem[ielem] = atof(words[2]);
+
+
+    if (comm->me == 0) {
+      if (screen) fprintf(screen,"SNAP Element = %s, Radius %g, Weight %g \n",
+                          elements[ielem], radelem[ielem], wjelem[ielem]);
+      if (logfile) fprintf(logfile,"SNAP Element = %s, Radius %g, Weight %g \n",
+                          elements[ielem], radelem[ielem], wjelem[ielem]);
+    }
+
+    for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
+      if (comm->me == 0) {
+        ptr = fgets(line,MAXLINE,fpcoeff);
+        if (ptr == NULL) {
+          eof = 1;
+          fclose(fpcoeff);
+        } else n = strlen(line) + 1;
+      }
+
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+      MPI_Bcast(&n,1,MPI_INT,0,world);
+      MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+      nwords = atom->count_words(line);
+      if (nwords != 1)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+      iword = 0;
+      words[iword] = strtok(line,"' \t\n\r\f");
+
+      coeffelem[ielem][icoeff] = atof(words[0]);
+
+    }
+  }
+
+  // set flags for required keywords
+
+  rcutfacflag = 0;
+  twojmaxflag = 0;
+
+  // Set defaults for optional keywords
+
+  rfac0 = 0.99363;
+  rmin0 = 0.0;
+  diagonalstyle = 3;
+  switchflag = 1;
+  bzeroflag = 1;
+  quadraticflag = 0;
+
+  // open SNAP parameter file on proc 0
+
+  FILE *fpparam;
+  if (comm->me == 0) {
+    fpparam = force->open_potential(paramfilename);
+    if (fpparam == NULL) {
+      char str[128];
+      snprintf(str,128,"Cannot open SNAP parameter file %s",paramfilename);
+      error->one(FLERR,str);
+    }
+  }
+
+  eof = 0;
+  while (1) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpparam);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fpparam);
+      } else n = strlen(line) + 1;
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    if ((ptr = strchr(line,'#'))) *ptr = '\0';
+    nwords = atom->count_words(line);
+    if (nwords == 0) continue;
+
+    if (nwords != 2)
+      error->all(FLERR,"Incorrect format in SNAP parameter file");
+
+    // words = ptrs to all words in line
+    // strip single and double quotes from words
+
+    char* keywd = strtok(line,"' \t\n\r\f");
+    char* keyval = strtok(NULL,"' \t\n\r\f");
+
+    if (comm->me == 0) {
+      if (screen) fprintf(screen,"SNAP keyword %s %s \n",keywd,keyval);
+      if (logfile) fprintf(logfile,"SNAP keyword %s %s \n",keywd,keyval);
+    }
+
+    if (strcmp(keywd,"rcutfac") == 0) {
+      rcutfac = atof(keyval);
+      rcutfacflag = 1;
+    } else if (strcmp(keywd,"twojmax") == 0) {
+      twojmax = atoi(keyval);
+      twojmaxflag = 1;
+    } else if (strcmp(keywd,"rfac0") == 0)
+      rfac0 = atof(keyval);
+    else if (strcmp(keywd,"rmin0") == 0)
+      rmin0 = atof(keyval);
+    else if (strcmp(keywd,"diagonalstyle") == 0)
+      diagonalstyle = atoi(keyval);
+    else if (strcmp(keywd,"switchflag") == 0)
+      switchflag = atoi(keyval);
+    else if (strcmp(keywd,"bzeroflag") == 0)
+      bzeroflag = atoi(keyval);
+    else if (strcmp(keywd,"quadraticflag") == 0)
+      quadraticflag = atoi(keyval);
+    else
+      error->all(FLERR,"Incorrect SNAP parameter file");
+  }
+
+  if (rcutfacflag == 0 || twojmaxflag == 0)
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double PairNNSNAP::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  int n = atom->ntypes+1;
+  bytes += n*n*sizeof(int);
+  bytes += n*n*sizeof(double);
+  bytes += 3*nmax*sizeof(double);
+  bytes += nmax*sizeof(int);
+  bytes += (2*ncoeffall)*sizeof(double);
+  bytes += (ncoeff*3)*sizeof(double);
+  bytes += sna[0]->memory_usage()*nthreads;
+  return bytes;
+}
+
diff --git a/src/SNAP/pair_nn_snap.h b/src/SNAP/pair_nn_snap.h
new file mode 100644
index 0000000000..f77ddee207
--- /dev/null
+++ b/src/SNAP/pair_nn_snap.h
@@ -0,0 +1,184 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(nn/snap,PairNNSNAP)
+
+#else
+
+#ifndef LMP_PAIR_NN_SNAP_H
+#define LMP_PAIR_NN_SNAP_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairNNSNAP : public Pair {
+public:
+  PairNNSNAP(class LAMMPS *);
+  ~PairNNSNAP();
+  virtual void compute(int, int);
+  void compute_regular(int, int);
+  void compute_optimized(int, int);
+  void settings(int, char **);
+  virtual void coeff(int, char **);
+  virtual void init_style();
+  virtual double init_one(int, int);
+  virtual double memory_usage();
+
+  double rcutfac, quadraticflag; // declared public to workaround gcc 4.9
+  int ncoeff;                    //  compiler bug, manifest in KOKKOS package
+
+protected:
+  int ncoeffq, ncoeffall;
+  double **bvec, ***dbvec;
+  class SNA** sna;
+  int nmax;
+  int nthreads;
+  virtual void allocate();
+  void read_files(char *, char *);
+  inline int equal(double* x,double* y);
+  inline double dist2(double* x,double* y);
+  double extra_cutoff();
+  void load_balance();
+  void set_sna_to_shared(int snaid,int i);
+  void build_per_atom_arrays();
+
+  void compute_beta();
+
+  int schedule_user;
+  double schedule_time_guided;
+  double schedule_time_dynamic;
+
+  int ncalls_neigh;
+  int do_load_balance;
+  int ilistmask_max;
+  int* ilistmask;
+  int ghostinum;
+  int ghostilist_max;
+  int* ghostilist;
+  int ghostnumneigh_max;
+  int* ghostnumneigh;
+  int* ghostneighs;
+  int* ghostfirstneigh;
+  int ghostneighs_total;
+  int ghostneighs_max;
+
+  int use_optimized;
+  int use_shared_arrays;
+
+  int i_max;
+  int i_neighmax;
+  int i_numpairs;
+  int **i_pairs;
+  double ***i_rij;
+  int **i_inside;
+  double **i_wj;
+  double **i_rcutij;
+  int *i_ninside;
+  double ****i_uarraytot_r, ****i_uarraytot_i;
+  double ******i_zarray_r, ******i_zarray_i;
+
+#ifdef TIMING_INFO
+  //  timespec starttime, endtime;
+  double timers[4];
+#endif
+
+  double rcutmax;               // max cutoff for all elements
+  int nelements;                // # of unique elements
+  char **elements;              // names of unique elements
+  double *radelem;              // element radii
+  double *wjelem;               // elements weights
+  double **coeffelem;           // element bispectrum coefficients
+  double** beta;                // betas for all atoms in list
+  int *map;                     // mapping from atom types to elements
+  int twojmax, diagonalstyle, switchflag, bzeroflag;
+  double rfac0, rmin0, wj1, wj2;
+  int rcutfacflag, twojmaxflag; // flags for required parameters
+  int beta_max;                 // length of beta
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Communication cutoff too small for SNAP micro load balancing
+
+This can happen if you change the neighbor skin after your pair_style
+command or if your box dimensions grow during a run. You can set the
+cutoff explicitly via the comm_modify cutoff command.
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Must set number of threads via package omp command
+
+Because you are using the USER-OMP package, set the number of threads
+via its settings, not by the pair_style snap nthreads setting.
+
+W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Incorrect SNAP coeff file
+
+UNDOCUMENTED
+
+E: Incorrect SNAP parameter file
+
+The file cannot be parsed correctly, check its internal syntax.
+
+E: Pair style SNAP requires newton pair on
+
+See the newton command.  This is a restriction to use the SNAP
+potential.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Cannot open SNAP coefficient file %s
+
+The specified SNAP coefficient file cannot be opened.  Check that the
+path and name are correct.
+
+E: Incorrect format in SNAP coefficient file
+
+Incorrect number of words per line in the coefficient file.
+
+E: Cannot open SNAP parameter file %s
+
+The specified SNAP parameter file cannot be opened.  Check that the
+path and name are correct.
+
+E: Incorrect format in SNAP parameter file
+
+Incorrect number of words per line in the parameter file.
+
+E: Did not find all elements in SNAP coefficient file.
+
+One or more elements listed in the pair_coeff command were not found in the coefficient file.
+
+*/

From f2d881470d0edfbe68adc12204ddd1f44621d069 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Sat, 11 May 2019 12:55:11 -0600
Subject: [PATCH 05/21] Added placeholder for neural network SNAP potential

---
 examples/snap/W.nnsnap  | 16 +++++++++++++++
 examples/snap/in.nnsnap | 45 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 examples/snap/W.nnsnap
 create mode 100644 examples/snap/in.nnsnap

diff --git a/examples/snap/W.nnsnap b/examples/snap/W.nnsnap
new file mode 100644
index 0000000000..93c2bf866a
--- /dev/null
+++ b/examples/snap/W.nnsnap
@@ -0,0 +1,16 @@
+# DATE: 2017-02-20 CONTRIBUTOR: Mitchell Wood mitwood@sandia.gov CITATION: Wood, M. A. and Thompson, A. P. "Quantum-Accurate Molecular Dynamics Potential for Tungsten" arXiv:1702.07042 [physics.comp-ph]
+#
+# Definition of SNAP+ZBL potential.
+variable zblcutinner equal 4
+variable zblcutouter equal 4.8
+variable zblz equal 74
+
+# Specify hybrid with SNAP and ZBL
+
+pair_style hybrid/overlay &
+zbl ${zblcutinner} ${zblcutouter} &
+snap
+pair_coeff 1 1 zbl ${zblz} ${zblz}
+pair_coeff * * snap W_2940_2017_2.snapcoeff W_2940_2017_2.snapparam W
+
+#Nomenclature on the snap files are Element_DakotaID_Year_Month
diff --git a/examples/snap/in.nnsnap b/examples/snap/in.nnsnap
new file mode 100644
index 0000000000..d575757d56
--- /dev/null
+++ b/examples/snap/in.nnsnap
@@ -0,0 +1,45 @@
+# Demonstrate SNAP Ta potential
+
+# Initialize simulation
+
+variable nsteps index 100
+variable nrep equal 4
+variable a equal 3.1803
+units		metal
+
+# generate the box and atom positions using a BCC lattice
+
+variable nx equal ${nrep}
+variable ny equal ${nrep}
+variable nz equal ${nrep}
+
+boundary	p p p
+
+lattice         bcc $a
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box	1 box
+create_atoms	1 box
+
+mass 1 183.84
+
+# choose potential
+
+include W.nnsnap
+
+# Setup output
+
+thermo		10
+thermo_modify norm yes
+
+# Set up NVE run
+
+timestep 0.5e-3
+neighbor 1.0 bin
+neigh_modify once no every 1 delay 0 check yes
+
+# Run MD
+
+velocity all create 300.0 4928459
+fix 1 all nve
+run             ${nsteps}
+

From a0cc6b5b59a41b24f646d218551cac1ff47b7566 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Sat, 11 May 2019 14:04:21 -0600
Subject: [PATCH 06/21] Forgot to change pair style to nn/snap

---
 examples/snap/W.nnsnap | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/snap/W.nnsnap b/examples/snap/W.nnsnap
index 93c2bf866a..6ca97a701a 100644
--- a/examples/snap/W.nnsnap
+++ b/examples/snap/W.nnsnap
@@ -9,8 +9,8 @@ variable zblz equal 74
 
 pair_style hybrid/overlay &
 zbl ${zblcutinner} ${zblcutouter} &
-snap
+nn/snap
 pair_coeff 1 1 zbl ${zblz} ${zblz}
-pair_coeff * * snap W_2940_2017_2.snapcoeff W_2940_2017_2.snapparam W
+pair_coeff * * nn/snap W_2940_2017_2.snapcoeff W_2940_2017_2.snapparam W
 
 #Nomenclature on the snap files are Element_DakotaID_Year_Month

From 98d9c45ad97be8e2f26a7310d2f0627429dc4a43 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Wed, 15 May 2019 17:18:24 -0600
Subject: [PATCH 07/21] compute_bispectrum

---
 src/SNAP/pair_snap.cpp | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 0bf367b5dc..4913044369 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -1319,6 +1319,28 @@ void PairSNAP::compute_beta()
   }
 }
 
+/* ----------------------------------------------------------------------
+   compute bispectrum
+------------------------------------------------------------------------- */
+
+void PairSNAP::compute_bispectrum()
+{
+  int i;
+  int *type = atom->type;
+
+  for (int ii = 0; ii < list->inum; ii++) {
+    i = list->ilist[ii];
+    const int itype = type[i];
+    const int ielem = map[itype];
+    double* coeffi = coeffelem[ielem];
+    snaptr->compute_bi();
+    snaptr->copy_bi2bvec();
+
+    for (int k = 0; k < ncoeff; k++)
+      bispectrum[ii][k] = snaptr->bvec[k];
+  }
+}
+
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */

From 5b71b3fc57f8cc9bd935202f35db0538006e37e9 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 16 May 2019 21:51:24 -0600
Subject: [PATCH 08/21] Added bispectrum compute

---
 src/SNAP/pair_snap.cpp | 480 +++++------------------------------------
 src/SNAP/pair_snap.h   |   2 +
 2 files changed, 56 insertions(+), 426 deletions(-)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 4913044369..f3b678971d 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -192,11 +192,13 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
   if (beta_max < list->inum) {
     memory->grow(beta,list->inum,ncoeff,"PairSNAP:beta");
+    memory->grow(bispectrum,list->inum,ncoeff,"PairSNAP:bispectrum");
     beta_max = list->inum;
   }
 
   // compute dE_i/dB_i = beta_i for all i in list
 
+  compute_bispectrum();
   compute_beta();
 
   numneigh = list->numneigh;
@@ -251,10 +253,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
     snaptr->compute_ui(ninside);
     snaptr->compute_zi();
-    if (quadraticflag) {
-      snaptr->compute_bi();
-      snaptr->copy_bi2bvec();
-    }
 
     // for neighbors of I within cutoff:
     // compute Fij = dEi/dRj = -dEi/dRi 
@@ -269,31 +267,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
       snaptr->compute_duidrj(snaptr->rij[jj],
                              snaptr->wj[jj],snaptr->rcutij[jj]);
 
-//       // quadratic contributions
-
-//       if (quadraticflag) {
-//         int k = ncoeff+1;
-//         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-//           double bveci = snaptr->bvec[icoeff];
-//           double fack = coeffi[k]*bveci;
-//           double* dbveci = snaptr->dbvec[icoeff];
-//           fij[0] += fack*dbveci[0];
-//           fij[1] += fack*dbveci[1];
-//           fij[2] += fack*dbveci[2];
-//           k++;
-//           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-//             double facki = coeffi[k]*bveci;
-//             double fackj = coeffi[k]*snaptr->bvec[jcoeff];
-//             double* dbvecj = snaptr->dbvec[jcoeff];
-
-//             fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-//             fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-//             fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-//             k++;
-//           }
-//         }
-//       }
-
       snaptr->compute_deidrj(fij);
 
       f[i][0] += fij[0];
@@ -320,10 +293,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
       double* coeffi = coeffelem[ielem];
       evdwl = coeffi[0];
-      if (!quadraticflag) {
-        snaptr->compute_bi();
-        snaptr->copy_bi2bvec();
-      }
 
       // E = beta.B + 0.5*B^t.alpha.B
       // coeff[k] = beta[k-1] or
@@ -332,21 +301,9 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
       // linear contributions
 
-      for (int k = 1; k <= ncoeff; k++)
-        evdwl += coeffi[k]*snaptr->bvec[k-1];
+      for (int k = 0; k < ncoeff; k++)
+        evdwl += beta[ii][k]*bispectrum[ii][k];
 
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = snaptr->bvec[icoeff];
-          evdwl += 0.5*coeffi[k++]*bveci*bveci;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            evdwl += coeffi[k++]*bveci*snaptr->bvec[jcoeff];
-          }
-        }
-      }
       ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
     }
 
@@ -355,383 +312,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
-
-/* ----------------------------------------------------------------------
-   This version is optimized for threading, micro-load balancing
-   ---------------------------------------------------------------------- */
-
-void PairSNAP::compute_optimized(int eflag, int vflag)
-{
-  // if reneighboring took place do load_balance if requested
-  if (do_load_balance > 0 &&
-      (neighbor->ncalls != ncalls_neigh)) {
-    ghostinum = 0;
-    // reset local ghost neighbor lists
-    ncalls_neigh = neighbor->ncalls;
-    if (ilistmask_max < list->inum) {
-      memory->grow(ilistmask,list->inum,"PairSnap::ilistmask");
-      ilistmask_max = list->inum;
-    }
-    for (int i = 0; i < list->inum; i++)
-      ilistmask[i] = 1;
-
-    //multiple passes for loadbalancing
-    for (int i = 0; i < do_load_balance; i++)
-      load_balance();
-  }
-
-  int numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      numpairs += jnum;
-    }
-  }
-
-  if (do_load_balance)
-    for (int ii = 0; ii < ghostinum; ii++) {
-      int i = ghostilist[ii];
-      int jnum = ghostnumneigh[i];
-      numpairs += jnum;
-    }
-
-  // optimized schedule setting
-
-  int time_dynamic = 0;
-  int time_guided = 0;
-
-  if (schedule_user == 0) schedule_user = 4;
-
-  switch (schedule_user) {
-  case 1:
-    omp_set_schedule(omp_sched_static,1);
-    break;
-  case 2:
-    omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  case 3:
-    omp_set_schedule(omp_sched_guided,2);
-    break;
-  case 4:
-    omp_set_schedule(omp_sched_auto,0);
-    break;
-  case 5:
-    if (numpairs < 8*nthreads) omp_set_schedule(omp_sched_dynamic,1);
-    else if (schedule_time_guided < 0.0) {
-      omp_set_schedule(omp_sched_guided,2);
-      if (!eflag && !vflag) time_guided = 1;
-    } else if (schedule_time_dynamic<0.0) {
-      omp_set_schedule(omp_sched_dynamic,1);
-      if (!eflag && !vflag) time_dynamic = 1;
-    } else if (schedule_time_guided<schedule_time_dynamic)
-      omp_set_schedule(omp_sched_guided,2);
-    else
-      omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  }
-
-  if (use_shared_arrays)
-    build_per_atom_arrays();
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(eflag,vflag,time_dynamic,time_guided) firstprivate(numpairs) default(none)
-#endif
-  {
-    // begin of pragma omp parallel
-
-    int tid = omp_get_thread_num();
-    int** pairs_tid_unique = NULL;
-
-    int** pairs;
-    if (use_shared_arrays) pairs = i_pairs;
-    else {
-      memory->create(pairs_tid_unique,numpairs,4,"numpairs");
-      pairs = pairs_tid_unique;
-    }
-
-    if (!use_shared_arrays) {
-      numpairs = 0;
-      for (int ii = 0; ii < list->inum; ii++) {
-        if ((do_load_balance <= 0) || ilistmask[ii]) {
-          int i = list->ilist[ii];
-          int jnum = list->numneigh[i];
-          for (int jj = 0; jj<jnum; jj++) {
-            pairs[numpairs][0] = i;
-            pairs[numpairs][1] = jj;
-            pairs[numpairs][2] = -1;
-            numpairs++;
-          }
-        }
-      }
-
-      for (int ii = 0; ii < ghostinum; ii++) {
-        int i = ghostilist[ii];
-        int jnum = ghostnumneigh[i];
-        for (int jj = 0; jj<jnum; jj++) {
-          pairs[numpairs][0] = i;
-          pairs[numpairs][1] = jj;
-          pairs[numpairs][2] = -1;
-          numpairs++;
-        }
-      }
-    }
-
-    int ielem;
-    int jj,k,jnum,jtype,ninside;
-    double delx,dely,delz,evdwl,rsq;
-    double fij[3];
-    int *jlist,*numneigh,**firstneigh;
-    evdwl = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp master
-#endif
-    {
-      ev_init(eflag,vflag);
-    }
-
-#if defined(_OPENMP)
-#pragma omp barrier
-    { ; }
-#endif
-
-    double **x = atom->x;
-    double **f = atom->f;
-    int *type = atom->type;
-    int nlocal = atom->nlocal;
-    int newton_pair = force->newton_pair;
-
-    numneigh = list->numneigh;
-    firstneigh = list->firstneigh;
-
-#ifdef TIMING_INFO
-    // only update micro timers after setup
-    static int count=0;
-    if (count<2) {
-      sna[tid]->timers[0] = 0;
-      sna[tid]->timers[1] = 0;
-      sna[tid]->timers[2] = 0;
-      sna[tid]->timers[3] = 0;
-      sna[tid]->timers[4] = 0;
-    }
-    count++;
-#endif
-
-    // did thread start working on interactions of new atom
-    int iold = -1;
-
-    double starttime, endtime;
-    if (time_dynamic || time_guided)
-      starttime = MPI_Wtime();
-
-#if defined(_OPENMP)
-#pragma omp for schedule(runtime)
-#endif
-    for (int iijj = 0; iijj < numpairs; iijj++) {
-      int i = 0;
-      if (use_shared_arrays) {
-        i = i_pairs[iijj][0];
-        if (iold != i) {
-          set_sna_to_shared(tid,i_pairs[iijj][3]);
-          ielem = map[type[i]];
-        }
-        iold = i;
-      } else {
-        i = pairs[iijj][0];
-        if (iold != i) {
-          iold = i;
-          const double xtmp = x[i][0];
-          const double ytmp = x[i][1];
-          const double ztmp = x[i][2];
-          const int itype = type[i];
-          ielem = map[itype];
-          const double radi = radelem[ielem];
-
-          if (i < nlocal) {
-            jlist = firstneigh[i];
-            jnum = numneigh[i];
-          } else {
-            jlist = ghostneighs+ghostfirstneigh[i];
-            jnum = ghostnumneigh[i];
-          }
-
-          // insure rij, inside, wj, and rcutij are of size jnum
-
-          sna[tid]->grow_rij(jnum);
-
-          // rij[][3] = displacements between atom I and those neighbors
-          // inside = indices of neighbors of I within cutoff
-          // wj = weights of neighbors of I within cutoff
-          // rcutij = cutoffs of neighbors of I within cutoff
-          // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
-
-          ninside = 0;
-          for (jj = 0; jj < jnum; jj++) {
-            int j = jlist[jj];
-            j &= NEIGHMASK;
-            delx = x[j][0] - xtmp; //unitialised
-            dely = x[j][1] - ytmp;
-            delz = x[j][2] - ztmp;
-            rsq = delx*delx + dely*dely + delz*delz;
-            jtype = type[j];
-            int jelem = map[jtype];
-
-            if (rsq < cutsq[itype][jtype]&&rsq>1e-20) { //unitialised
-              sna[tid]->rij[ninside][0] = delx;
-              sna[tid]->rij[ninside][1] = dely;
-              sna[tid]->rij[ninside][2] = delz;
-              sna[tid]->inside[ninside] = j;
-              sna[tid]->wj[ninside] = wjelem[jelem];
-              sna[tid]->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
-              ninside++;
-
-              // update index list with inside index
-              pairs[iijj + (jj - pairs[iijj][1])][2] =
-                ninside-1; //unitialised
-            }
-          }
-
-          // compute Ui and Zi for atom I
-
-          sna[tid]->compute_ui(ninside); //unitialised
-          sna[tid]->compute_zi();
-        }
-      }
-      if (quadraticflag) {
-        sna[tid]->compute_bi();
-        sna[tid]->copy_bi2bvec();
-      }
-
-      // for neighbors of I within cutoff:
-      // compute dUi/drj and dBi/drj
-      // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
-
-      // entry into loop if inside index is set
-
-      double* coeffi = coeffelem[ielem];
-
-      if (pairs[iijj][2] >= 0) {
-        jj = pairs[iijj][2];
-        int j = sna[tid]->inside[jj];
-        sna[tid]->compute_duidrj(sna[tid]->rij[jj],
-                                 sna[tid]->wj[jj],sna[tid]->rcutij[jj]);
-
-        sna[tid]->compute_dbidrj();
-        sna[tid]->copy_dbi2dbvec();
-
-        fij[0] = 0.0;
-        fij[1] = 0.0;
-        fij[2] = 0.0;
-
-        // linear contributions
-
-        for (k = 1; k <= ncoeff; k++) {
-          double bgb = coeffi[k];
-          fij[0] += bgb*sna[tid]->dbvec[k-1][0];
-          fij[1] += bgb*sna[tid]->dbvec[k-1][1];
-          fij[2] += bgb*sna[tid]->dbvec[k-1][2];
-        }
-
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = sna[tid]->bvec[icoeff];
-          double fack = coeffi[k]*bveci;
-          double* dbveci = sna[tid]->dbvec[icoeff];
-          fij[0] += fack*sna[tid]->dbvec[icoeff][0];
-          fij[1] += fack*sna[tid]->dbvec[icoeff][1];
-          fij[2] += fack*sna[tid]->dbvec[icoeff][2];
-          k++;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            double facki = coeffi[k]*bveci;
-            double fackj = coeffi[k]*sna[tid]->bvec[jcoeff];
-            double* dbvecj = sna[tid]->dbvec[jcoeff];
-            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-            k++;
-          }
-        }
-      }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        {
-          f[i][0] += fij[0];
-          f[i][1] += fij[1];
-          f[i][2] += fij[2];
-          f[j][0] -= fij[0];
-          f[j][1] -= fij[1];
-          f[j][2] -= fij[2];
-
-          // tally per-atom virial contribution
-
-          if (vflag)
-            ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
-                         fij[0],fij[1],fij[2],
-                         -sna[tid]->rij[jj][0],-sna[tid]->rij[jj][1],
-                         -sna[tid]->rij[jj][2]);
-        }
-      }
-
-      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
-      // only call this for first pair of each atom i
-      // if atom has no pairs, eatom=0, which is wrong
-
-      if (eflag&&pairs[iijj][1] == 0) {
-        evdwl = coeffi[0];
-
-        if (!quadraticflag) {
-          sna[tid]->compute_bi();
-          sna[tid]->copy_bi2bvec();
-        }
-
-        // E = beta.B + 0.5*B^t.alpha.B
-        // coeff[k] = beta[k-1] or
-        // coeff[k] = alpha_ii or
-        // coeff[k] = alpha_ij = alpha_ji, j != i
-
-        // linear contributions
-
-        for (int k = 1; k <= ncoeff; k++)
-          evdwl += coeffi[k]*sna[tid]->bvec[k-1];
-
-        // quadratic contributions
-
-        if (quadraticflag) {
-          int k = ncoeff+1;
-          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bveci = sna[tid]->bvec[icoeff];
-            evdwl += 0.5*coeffi[k++]*bveci*bveci;
-            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              evdwl += coeffi[k++]*bveci*sna[tid]->bvec[jcoeff];
-            }
-          }
-        }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
-      }
-
-    }
-    if (time_dynamic || time_guided)
-      endtime = MPI_Wtime();
-    if (time_dynamic) schedule_time_dynamic = endtime - starttime;
-    if (time_guided) schedule_time_guided = endtime - starttime;
-    if (!use_shared_arrays) memory->destroy(pairs);
-
-  }// end of pragma omp parallel
-
-  if (vflag_fdotr) virial_fdotr_compute();
-
-}
-
 inline int PairSNAP::equal(double* x,double* y)
 {
   double dist2 =
@@ -1325,14 +905,61 @@ void PairSNAP::compute_beta()
 
 void PairSNAP::compute_bispectrum()
 {
-  int i;
+  int i,j,jnum,ninside;
+  double delx,dely,delz,rsq;
+  int *jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
   int *type = atom->type;
+  class SNA* snaptr = sna[0];
 
   for (int ii = 0; ii < list->inum; ii++) {
     i = list->ilist[ii];
+
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
     const int itype = type[i];
     const int ielem = map[itype];
-    double* coeffi = coeffelem[ielem];
+    const double radi = radelem[ielem];
+
+    jlist = list->firstneigh[i];
+    jnum = list->numneigh[i];
+
+    // insure rij, inside, wj, and rcutij are of size jnum
+
+    snaptr->grow_rij(jnum);
+
+    // rij[][3] = displacements between atom I and those neighbors
+    // inside = indices of neighbors of I within cutoff
+    // wj = weights for neighbors of I within cutoff
+    // rcutij = cutoffs for neighbors of I within cutoff
+    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+    ninside = 0;
+    for (int jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      delx = x[j][0] - xtmp;
+      dely = x[j][1] - ytmp;
+      delz = x[j][2] - ztmp;
+      rsq = delx*delx + dely*dely + delz*delz;
+      int jtype = type[j];
+      int jelem = map[jtype];
+
+      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
+        snaptr->rij[ninside][0] = delx;
+        snaptr->rij[ninside][1] = dely;
+        snaptr->rij[ninside][2] = delz;
+        snaptr->inside[ninside] = j;
+        snaptr->wj[ninside] = wjelem[jelem];
+        snaptr->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
+        ninside++;
+      }
+    }
+
+    snaptr->compute_ui(ninside);
+    snaptr->compute_zi();
     snaptr->compute_bi();
     snaptr->copy_bi2bvec();
 
@@ -1470,6 +1097,7 @@ void PairSNAP::coeff(int narg, char **arg)
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
     memory->destroy(beta);
+    memory->destroy(bispectrum);
   }
 
   char* type1 = arg[0];
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index 94d21162e2..1453076b23 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -56,6 +56,7 @@ protected:
   void build_per_atom_arrays();
 
   void compute_beta();
+  void compute_bispectrum();
 
   int schedule_user;
   double schedule_time_guided;
@@ -102,6 +103,7 @@ protected:
   double *wjelem;               // elements weights
   double **coeffelem;           // element bispectrum coefficients
   double** beta;                // betas for all atoms in list
+  double** bispectrum;          // bispectrum components for all atoms in list
   int *map;                     // mapping from atom types to elements
   int twojmax, diagonalstyle, switchflag, bzeroflag;
   double rfac0, rmin0, wj1, wj2;

From 51a6bfd579722cce8bc8f00de63ba08ff37ca9e4 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 16 May 2019 22:01:45 -0600
Subject: [PATCH 09/21] Added bispectrum compute

---
 src/SNAP/pair_snap.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index f3b678971d..7c81eb0e8a 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -102,6 +102,8 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
   sna = NULL;
 
   beta_max = 0;
+  beta = NULL;
+  bispectrum = NULL;
 }
 
 /* ---------------------------------------------------------------------- */

From 803e0631c5e51ed5bdacebb6dbec9e5118d36954 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 16 May 2019 22:11:06 -0600
Subject: [PATCH 10/21] Added bispectrum compute

---
 src/SNAP/pair_snap.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 7c81eb0e8a..86e709ba03 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -119,8 +119,9 @@ PairSNAP::~PairSNAP()
     memory->destroy(radelem);
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
-    memory->destroy(beta);
   }
+  memory->destroy(beta);
+  memory->destroy(bispectrum);
 
   // Need to set this because restart not handled by PairHybrid
 
@@ -1098,9 +1099,9 @@ void PairSNAP::coeff(int narg, char **arg)
     memory->destroy(radelem);
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
-    memory->destroy(beta);
-    memory->destroy(bispectrum);
   }
+  memory->destroy(beta);
+  memory->destroy(bispectrum);
 
   char* type1 = arg[0];
   char* type2 = arg[1];

From 960a975e2a6c2125215b4945bc7abb1163a29e91 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Mon, 3 Jun 2019 19:50:40 -0600
Subject: [PATCH 11/21] Added compact arrays, removed unused openmp stuff

---
 src/SNAP/compute_sna_atom.cpp  |  63 +--
 src/SNAP/compute_sna_atom.h    |   3 +-
 src/SNAP/compute_snad_atom.cpp | 101 ++---
 src/SNAP/compute_snad_atom.h   |   3 +-
 src/SNAP/compute_snav_atom.cpp | 112 ++---
 src/SNAP/compute_snav_atom.h   |   3 +-
 src/SNAP/openmp_snap.h         |  16 -
 src/SNAP/pair_snap.cpp         | 783 +--------------------------------
 src/SNAP/pair_snap.h           |  56 +--
 src/SNAP/sna.cpp               | 748 ++++++++-----------------------
 src/SNAP/sna.h                 |  58 +--
 11 files changed, 332 insertions(+), 1614 deletions(-)
 delete mode 100644 src/SNAP/openmp_snap.h

diff --git a/src/SNAP/compute_sna_atom.cpp b/src/SNAP/compute_sna_atom.cpp
index 5ca63a7e85..17774143d5 100644
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -115,20 +114,10 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute sna/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   size_peratom_cols = ncoeff;
   if (quadraticflag) size_peratom_cols += (ncoeff*(ncoeff+1))/2;
   peratom_flag = 1;
@@ -136,7 +125,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   nmax = 0;
   njmax = 0;
   sna = NULL;
-
 }
 
 /* ---------------------------------------------------------------------- */
@@ -147,9 +135,7 @@ ComputeSNAAtom::~ComputeSNAAtom()
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -176,13 +162,7 @@ void ComputeSNAAtom::init()
     if (strcmp(modify->compute[i]->style,"sna/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute sna/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -223,11 +203,7 @@ void ComputeSNAAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -241,7 +217,7 @@ void ComputeSNAAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -258,26 +234,26 @@ void ComputeSNAAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype] && rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
-      snaptr[tid]->compute_bi();
-      snaptr[tid]->copy_bi2bvec();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
+      snaptr->compute_bi();
+      snaptr->copy_bi2bvec();
       for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-        sna[i][icoeff] = snaptr[tid]->bvec[icoeff];
+        sna[i][icoeff] = snaptr->bvec[icoeff];
       if (quadraticflag) {
         int ncount = ncoeff;
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bi = snaptr[tid]->bvec[icoeff];
+          double bi = snaptr->bvec[icoeff];
 
           // diagonal element of quadratic matrix
 
@@ -286,7 +262,7 @@ void ComputeSNAAtom::compute_peratom()
           // upper-triangular elements of quadratic matrix
 
           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++)
-            sna[i][ncount++] = bi*snaptr[tid]->bvec[jcoeff];
+            sna[i][ncount++] = bi*snaptr->bvec[jcoeff];
         }
       }
     } else {
@@ -305,7 +281,6 @@ double ComputeSNAAtom::memory_usage()
   double bytes = nmax*size_peratom_cols * sizeof(double);
   bytes += 3*njmax*sizeof(double);
   bytes += njmax*sizeof(int);
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
   return bytes;
 }
 
diff --git a/src/SNAP/compute_sna_atom.h b/src/SNAP/compute_sna_atom.h
index 2f6fb18996..56ffccfa7e 100644
--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@@ -42,10 +42,9 @@ class ComputeSNAAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   double cutmax;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/compute_snad_atom.cpp b/src/SNAP/compute_snad_atom.cpp
index b0395d5317..b356d61d3d 100644
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -113,20 +112,10 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snad/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   nperdim = ncoeff;
   if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
   yoffset = nperdim;
@@ -138,7 +127,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   nmax = 0;
   njmax = 0;
   snad = NULL;
-
 }
 
 /* ---------------------------------------------------------------------- */
@@ -149,9 +137,7 @@ ComputeSNADAtom::~ComputeSNADAtom()
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -178,13 +164,7 @@ void ComputeSNADAtom::init()
     if (strcmp(modify->compute[i]->style,"snad/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute snad/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -235,11 +215,7 @@ void ComputeSNADAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -258,7 +234,7 @@ void ComputeSNADAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -276,30 +252,30 @@ void ComputeSNADAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
       if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
+        snaptr->copy_bi2bvec();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        const int j = snaptr->inside[jj];
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
+        snaptr->copy_dbi2dbvec();
 
         // Accumulate -dBi/dRi, -dBi/dRj
 
@@ -307,12 +283,12 @@ void ComputeSNADAtom::compute_peratom()
         double *snadj = snad[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snadi[icoeff] += snaptr[tid]->dbvec[icoeff][0];
-          snadi[icoeff+yoffset] += snaptr[tid]->dbvec[icoeff][1];
-          snadi[icoeff+zoffset] += snaptr[tid]->dbvec[icoeff][2];
-          snadj[icoeff] -= snaptr[tid]->dbvec[icoeff][0];
-          snadj[icoeff+yoffset] -= snaptr[tid]->dbvec[icoeff][1];
-          snadj[icoeff+zoffset] -= snaptr[tid]->dbvec[icoeff][2];
+          snadi[icoeff] += snaptr->dbvec[icoeff][0];
+          snadi[icoeff+yoffset] += snaptr->dbvec[icoeff][1];
+          snadi[icoeff+zoffset] += snaptr->dbvec[icoeff][2];
+          snadj[icoeff] -= snaptr->dbvec[icoeff][0];
+          snadj[icoeff+yoffset] -= snaptr->dbvec[icoeff][1];
+          snadj[icoeff+zoffset] -= snaptr->dbvec[icoeff][2];
         }
 
         if (quadraticflag) {
@@ -321,10 +297,10 @@ void ComputeSNADAtom::compute_peratom()
           snadj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->bvec[icoeff];
+            double bix = snaptr->dbvec[icoeff][0];
+            double biy = snaptr->dbvec[icoeff][1];
+            double biz = snaptr->dbvec[icoeff][2];
 
             // diagonal elements of quadratic matrix
 
@@ -343,12 +319,12 @@ void ComputeSNADAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
+                + bix*snaptr->bvec[jcoeff];
+              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
+                + biy*snaptr->bvec[jcoeff];
+              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
+                + biz*snaptr->bvec[jcoeff];
 
               snadi[ncount] +=         dbxtmp;
               snadi[ncount+yoffset] += dbytmp;
@@ -408,6 +384,5 @@ double ComputeSNADAtom::memory_usage()
   bytes += 3*njmax*sizeof(double);
   bytes += njmax*sizeof(int);
   bytes += 3*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
   return bytes;
 }
diff --git a/src/SNAP/compute_snad_atom.h b/src/SNAP/compute_snad_atom.h
index 92003a9bc5..1fcf540d7c 100644
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@@ -44,10 +44,9 @@ class ComputeSNADAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   double cutmax;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/compute_snav_atom.cpp b/src/SNAP/compute_snav_atom.cpp
index b2d555f713..9f9ef7a67d 100644
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -109,20 +108,10 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snav/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   nperdim = ncoeff;
   if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
   size_peratom_cols = 6*nperdim*atom->ntypes;
@@ -144,9 +133,7 @@ ComputeSNAVAtom::~ComputeSNAVAtom()
   memory->destroy(wjelem);
   memory->destroy(cutsq);
 
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -174,13 +161,7 @@ void ComputeSNAVAtom::init()
     if (strcmp(modify->compute[i]->style,"snav/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute snav/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -230,11 +211,7 @@ void ComputeSNAVAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -251,7 +228,7 @@ void ComputeSNAVAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-          snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -269,31 +246,31 @@ void ComputeSNAVAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
       if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
+        snaptr->copy_bi2bvec();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
+        const int j = snaptr->inside[jj];
 
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
+        snaptr->copy_dbi2dbvec();
 
         // Accumulate -dBi/dRi*Ri, -dBi/dRj*Rj
 
@@ -301,18 +278,18 @@ void ComputeSNAVAtom::compute_peratom()
         double *snavj = snav[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snavi[icoeff]           += snaptr[tid]->dbvec[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr[tid]->dbvec[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr[tid]->dbvec[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ytmp;
-          snavj[icoeff]           -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
+          snavi[icoeff]           += snaptr->dbvec[icoeff][0]*xtmp;
+          snavi[icoeff+nperdim]   += snaptr->dbvec[icoeff][1]*ytmp;
+          snavi[icoeff+2*nperdim] += snaptr->dbvec[icoeff][2]*ztmp;
+          snavi[icoeff+3*nperdim] += snaptr->dbvec[icoeff][1]*ztmp;
+          snavi[icoeff+4*nperdim] += snaptr->dbvec[icoeff][0]*ztmp;
+          snavi[icoeff+5*nperdim] += snaptr->dbvec[icoeff][0]*ytmp;
+          snavj[icoeff]           -= snaptr->dbvec[icoeff][0]*x[j][0];
+          snavj[icoeff+nperdim]   -= snaptr->dbvec[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nperdim] -= snaptr->dbvec[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nperdim] -= snaptr->dbvec[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][1];
         }
 
         if (quadraticflag) {
@@ -321,10 +298,10 @@ void ComputeSNAVAtom::compute_peratom()
           snavj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->bvec[icoeff];
+            double bix = snaptr->dbvec[icoeff][0];
+            double biy = snaptr->dbvec[icoeff][1];
+            double biz = snaptr->dbvec[icoeff][2];
 
             // diagonal element of quadratic matrix
 
@@ -348,12 +325,12 @@ void ComputeSNAVAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
+                + bix*snaptr->bvec[jcoeff];
+              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
+                + biy*snaptr->bvec[jcoeff];
+              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
+                + biz*snaptr->bvec[jcoeff];
               snavi[ncount] +=           dbxtmp*xtmp;
               snavi[ncount+nperdim] +=   dbytmp*ytmp;
               snavi[ncount+2*nperdim] += dbztmp*ztmp;
@@ -419,6 +396,5 @@ double ComputeSNAVAtom::memory_usage()
   bytes += njmax*sizeof(int);
   bytes += 6*nperdim*atom->ntypes;
   if (quadraticflag) bytes += 6*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
   return bytes;
 }
diff --git a/src/SNAP/compute_snav_atom.h b/src/SNAP/compute_snav_atom.h
index 9be5e1d389..6bcce346e0 100644
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@@ -44,9 +44,8 @@ class ComputeSNAVAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/openmp_snap.h b/src/SNAP/openmp_snap.h
deleted file mode 100644
index 60a3138c9c..0000000000
--- a/src/SNAP/openmp_snap.h
+++ /dev/null
@@ -1,16 +0,0 @@
-
-#ifndef LMP_OPENMP_SNAP_H
-#define LMP_OPENMP_SNAP_H
-
-#if defined(_OPENMP)
-#include <omp.h>
-#else
-enum omp_sched_t {omp_sched_static, omp_sched_dynamic, omp_sched_guided, omp_sched_auto};
-inline int omp_get_thread_num() { return 0;}
-inline int omp_set_num_threads(int num_threads) {return 1;}
-/* inline int __sync_fetch_and_add(int* ptr, int value) {int tmp = *ptr; ptr[0]+=value; return tmp;} */
-inline void omp_set_schedule(omp_sched_t schedule,int modifier=1) {}
-inline int omp_in_parallel() {return 0;}
-#endif
-
-#endif
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 86e709ba03..ae542e81b4 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -23,7 +23,6 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "sna.h"
-#include "openmp_snap.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
@@ -55,51 +54,6 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
   coeffelem = NULL;
 
   nmax = 0;
-  nthreads = 1;
-
-  schedule_user = 0;
-  schedule_time_guided = -1;
-  schedule_time_dynamic = -1;
-  ncalls_neigh =-1;
-
-  ilistmask_max = 0;
-  ilistmask = NULL;
-  ghostinum = 0;
-  ghostilist_max = 0;
-  ghostilist = NULL;
-  ghostnumneigh_max = 0;
-  ghostnumneigh = NULL;
-  ghostneighs = NULL;
-  ghostfirstneigh = NULL;
-  ghostneighs_total = 0;
-  ghostneighs_max = 0;
-
-  i_max = 0;
-  i_neighmax = 0;
-  i_numpairs = 0;
-  i_rij = NULL;
-  i_inside = NULL;
-  i_wj = NULL;
-  i_rcutij = NULL;
-  i_ninside = NULL;
-  i_pairs = NULL;
-  i_uarraytot_r = NULL;
-  i_uarraytot_i = NULL;
-  i_zarray_r = NULL;
-  i_zarray_i = NULL;
-
-  use_shared_arrays = 0;
-
-#ifdef TIMING_INFO
-  timers[0] = 0;
-  timers[1] = 0;
-  timers[2] = 0;
-  timers[3] = 0;
-#endif
-
-  // Need to set this because restart not handled by PairHybrid
-
-  sna = NULL;
 
   beta_max = 0;
   beta = NULL;
@@ -123,35 +77,7 @@ PairSNAP::~PairSNAP()
   memory->destroy(beta);
   memory->destroy(bispectrum);
 
-  // Need to set this because restart not handled by PairHybrid
-
-  if (sna) {
-
-#ifdef TIMING_INFO
-    double time[5];
-    double timeave[5];
-    double timeave_mpi[5];
-    double timemax_mpi[5];
-
-    for (int i = 0; i < 5; i++) {
-      time[i] = 0;
-      timeave[i] = 0;
-      for (int tid = 0; tid<nthreads; tid++) {
-        if (sna[tid]->timers[i]>time[i])
-          time[i] = sna[tid]->timers[i];
-        timeave[i] += sna[tid]->timers[i];
-      }
-      timeave[i] /= nthreads;
-    }
-    MPI_Reduce(timeave, timeave_mpi, 5, MPI_DOUBLE, MPI_SUM, 0, world);
-    MPI_Reduce(time, timemax_mpi, 5, MPI_DOUBLE, MPI_MAX, 0, world);
-#endif
-
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-
-  }
+  delete snaptr;
 
   if (allocated) {
     memory->destroy(setflag);
@@ -161,22 +87,11 @@ PairSNAP::~PairSNAP()
 
 }
 
-void PairSNAP::compute(int eflag, int vflag)
-{
-//   if (use_optimized)
-//     compute_optimized(eflag, vflag);
-//   else
-
-// hard-code compute_regular()
- 
-    compute_regular(eflag, vflag);
-}
-
 /* ----------------------------------------------------------------------
    This version is a straightforward implementation
    ---------------------------------------------------------------------- */
 
-void PairSNAP::compute_regular(int eflag, int vflag)
+void PairSNAP::compute(int eflag, int vflag)
 {
   int i,j,jnum,ninside;
   double delx,dely,delz,evdwl,rsq;
@@ -191,7 +106,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
-  class SNA* snaptr = sna[0];
 
   if (beta_max < list->inum) {
     memory->grow(beta,list->inum,ncoeff,"PairSNAP:beta");
@@ -315,573 +229,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
-inline int PairSNAP::equal(double* x,double* y)
-{
-  double dist2 =
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-  if (dist2 < 1e-20) return 1;
-  return 0;
-}
-
-inline double PairSNAP::dist2(double* x,double* y)
-{
-  return
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-}
-
-// return extra communication cutoff
-// extra_cutoff = max(subdomain_length)
-
-double PairSNAP::extra_cutoff()
-{
-  double sublo[3],subhi[3];
-
-  if (domain->triclinic == 0) {
-    for (int dim = 0 ; dim < 3 ; dim++) {
-      sublo[dim] = domain->sublo[dim];
-      subhi[dim] = domain->subhi[dim];
-    }
-  } else {
-    domain->lamda2x(domain->sublo_lamda,sublo);
-    domain->lamda2x(domain->subhi_lamda,subhi);
-  }
-
-  double sub_size[3];
-  for (int dim = 0; dim < 3; dim++)
-    sub_size[dim] = subhi[dim] - sublo[dim];
-
-  double max_sub_size = 0;
-  for (int dim = 0; dim < 3; dim++)
-    max_sub_size = MAX(max_sub_size,sub_size[dim]);
-
-  // note: for triclinic, probably need something different
-  // see Comm::setup()
-
-  return max_sub_size;
-}
-
-// micro load_balancer: each MPI process will
-// check with each of its 26 neighbors,
-// whether an imbalance exists in the number
-// of atoms to calculate forces for.
-// If it does it will set ilistmask of one of
-// its local atoms to zero, and send its Tag
-// to the neighbor process. The neighboring process
-// will check its ghost list for the
-// ghost atom with the same Tag which is closest
-// to its domain center, and build a
-// neighborlist for this ghost atom. For this to work,
-// the communication cutoff has to be
-// as large as the neighbor cutoff +
-// maximum subdomain length.
-
-// Note that at most one atom is exchanged per processor pair.
-
-// Also note that the local atom assignment
-// doesn't change. This load balancer will cause
-// some ghost atoms to have full neighborlists
-// which are unique to PairSNAP.
-// They are not part of the generally accessible neighborlist.
-// At the same time corresponding local atoms on
-// other MPI processes will not be
-// included in the force computation since
-// their ilistmask is 0. This does not effect
-// any other classes which might
-// access the same general neighborlist.
-// Reverse communication (newton on) of forces is required.
-
-// Currently the load balancer does two passes,
-// since its exchanging atoms upstream and downstream.
-
-void PairSNAP::load_balance()
-{
-  double sublo[3],subhi[3];
-  if (domain->triclinic == 0) {
-    double* sublotmp = domain->sublo;
-    double* subhitmp = domain->subhi;
-    for (int dim = 0 ; dim<3 ; dim++) {
-      sublo[dim]=sublotmp[dim];
-      subhi[dim]=subhitmp[dim];
-    }
-  } else {
-    double* sublotmp = domain->sublo_lamda;
-    double* subhitmp = domain->subhi_lamda;
-    domain->lamda2x(sublotmp,sublo);
-    domain->lamda2x(subhitmp,subhi);
-  }
-
-  //if (list->inum==0) list->grow(atom->nmax);
-
-  int nlocal = ghostinum;
-  for (int i=0; i < list->inum; i++)
-    if (ilistmask[i]) nlocal++;
-  int ***grid2proc = comm->grid2proc;
-  int* procgrid = comm->procgrid;
-
-  int nlocal_up,nlocal_down;
-  MPI_Request request;
-
-  double sub_mid[3];
-  for (int dim=0; dim<3; dim++)
-    sub_mid[dim] = (subhi[dim] + sublo[dim])/2;
-
-  if (comm->cutghostuser <
-      neighbor->cutneighmax+extra_cutoff())
-    error->all(FLERR,"Communication cutoff too small for SNAP micro load balancing");
-
-  int nrecv = ghostinum;
-  int totalsend = 0;
-  int nsend = 0;
-  int depth = 1;
-
-  for (int dx = -depth; dx < depth+1; dx++)
-    for (int dy = -depth; dy < depth+1; dy++)
-      for (int dz = -depth; dz < depth+1; dz++) {
-
-        if (dx == dy && dy == dz && dz == 0) continue;
-
-        int sendloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        sendloc[0] += dx;
-        sendloc[1] += dy;
-        sendloc[2] += dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] >= procgrid[dim])
-            sendloc[dim] = sendloc[dim] - procgrid[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] < 0)
-            sendloc[dim] = procgrid[dim] + sendloc[dim];
-        int recvloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        recvloc[0] -= dx;
-        recvloc[1] -= dy;
-        recvloc[2] -= dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] < 0)
-            recvloc[dim] = procgrid[dim] + recvloc[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] >= procgrid[dim])
-            recvloc[dim] = recvloc[dim] - procgrid[dim];
-
-        int sendproc = grid2proc[sendloc[0]][sendloc[1]][sendloc[2]];
-        int recvproc = grid2proc[recvloc[0]][recvloc[1]][recvloc[2]];
-
-        // two stage process, first upstream movement, then downstream
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-        nsend = 0;
-
-        // send upstream
-
-        if (nlocal > nlocal_up+1) {
-
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i] == 0)
-            i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,recvproc,0,world,&request);
-          else {
-            int j = -1;
-            MPI_Isend(&j,1,MPI_INT,recvproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j = 0; j < list->inum; j++)
-              if (list->ilist[j] == i)
-                ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // recv downstream
-
-        if (nlocal < nlocal_down-1) {
-          nlocal++;
-          int get_tag = -1;
-          MPI_Recv(&get_tag,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-          // if get_tag -1 the other process didnt have local atoms to send
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal+atom->nghost+100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid, atom->x[j]) < mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid, atom->x[j]);
-                }
-
-            // build neighborlist for this particular
-            // ghost atom, and add it to list->ilist
-
-            if (ghostneighs_max - ghostneighs_total <
-                neighbor->oneatom) {
-              memory->grow(ghostneighs,
-                           ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            // find all neighbors by looping
-            // over all local and ghost atoms
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-
-        // decrease nlocal later, so that it is the
-        // initial number both for receiving and sending
-
-        if (nsend) nlocal--;
-
-        // second pass through the grid
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-        // send downstream
-
-        nsend=0;
-        if (nlocal > nlocal_down+1) {
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i]==0) i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,sendproc,0,world,&request);
-          else {
-            int j =- 1;
-            MPI_Isend(&j,1,MPI_INT,sendproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j=0; j<list->inum; j++)
-              if (list->ilist[j] == i) ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // receive upstream
-
-        if (nlocal < nlocal_up-1) {
-          nlocal++;
-          int get_tag = -1;
-
-          MPI_Recv(&get_tag,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal + atom->nghost + 100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid,atom->x[j])<mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid,atom->x[j]);
-                }
-
-            // build neighborlist for this particular ghost atom
-
-            if (ghostneighs_max-ghostneighs_total < neighbor->oneatom) {
-              memory->grow(ghostneighs,ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-        if (nsend) nlocal--;
-      }
-}
-
-void PairSNAP::set_sna_to_shared(int snaid,int i)
-{
-  sna[snaid]->rij = i_rij[i];
-  sna[snaid]->inside = i_inside[i];
-  sna[snaid]->wj = i_wj[i];
-  sna[snaid]->rcutij = i_rcutij[i];
-  sna[snaid]->zarray_r = i_zarray_r[i];
-  sna[snaid]->zarray_i = i_zarray_i[i];
-  sna[snaid]->uarraytot_r = i_uarraytot_r[i];
-  sna[snaid]->uarraytot_i = i_uarraytot_i[i];
-}
-
-void PairSNAP::build_per_atom_arrays()
-{
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-  int count = 0;
-  int neighmax = 0;
-  for (int ii = 0; ii < list->inum; ii++)
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      neighmax=MAX(neighmax,list->numneigh[list->ilist[ii]]);
-      ++count;
-    }
-  for (int ii = 0; ii < ghostinum; ii++) {
-    neighmax=MAX(neighmax,ghostnumneigh[ghostilist[ii]]);
-    ++count;
-  }
-
-  if (i_max < count || i_neighmax < neighmax) {
-    int i_maxt = MAX(count,i_max);
-    i_neighmax = MAX(neighmax,i_neighmax);
-    memory->destroy(i_rij);
-    memory->destroy(i_inside);
-    memory->destroy(i_wj);
-    memory->destroy(i_rcutij);
-    memory->destroy(i_ninside);
-    memory->destroy(i_pairs);
-    memory->create(i_rij,i_maxt,i_neighmax,3,"PairSNAP::i_rij");
-    memory->create(i_inside,i_maxt,i_neighmax,"PairSNAP::i_inside");
-    memory->create(i_wj,i_maxt,i_neighmax,"PairSNAP::i_wj");
-    memory->create(i_rcutij,i_maxt,i_neighmax,"PairSNAP::i_rcutij");
-    memory->create(i_ninside,i_maxt,"PairSNAP::i_ninside");
-    memory->create(i_pairs,i_maxt*i_neighmax,4,"PairSNAP::i_pairs");
-  }
-
-  if (i_max < count) {
-    int jdim = sna[0]->twojmax+1;
-    memory->destroy(i_uarraytot_r);
-    memory->destroy(i_uarraytot_i);
-    memory->create(i_uarraytot_r,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_r");
-    memory->create(i_uarraytot_i,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_i");
-    if (i_zarray_r != NULL)
-      for (int i = 0; i < i_max; i++) {
-        memory->destroy(i_zarray_r[i]);
-        memory->destroy(i_zarray_i[i]);
-      }
-
-    delete [] i_zarray_r;
-    delete [] i_zarray_i;
-    i_zarray_r = new double*****[count];
-    i_zarray_i = new double*****[count];
-    for (int i = 0; i < count; i++) {
-      memory->create(i_zarray_r[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_r");
-      memory->create(i_zarray_i[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_i");
-    }
-  }
-
-  if (i_max < count)
-    i_max = count;
-
-  count = 0;
-  i_numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      int* jlist = list->firstneigh[i];
-      const double xtmp = atom->x[i][0];
-      const double ytmp = atom->x[i][1];
-      const double ztmp = atom->x[i][2];
-      const int itype = atom->type[i];
-      const int ielem = map[itype];
-      const double radi = radelem[ielem];
-      int ninside = 0;
-      for (int jj = 0; jj < jnum; jj++) {
-        int j = jlist[jj];
-        j &= NEIGHMASK;
-        const double delx = atom->x[j][0] - xtmp;
-        const double dely = atom->x[j][1] - ytmp;
-        const double delz = atom->x[j][2] - ztmp;
-        const double rsq = delx*delx + dely*dely + delz*delz;
-        int jtype = atom->type[j];
-        int jelem = map[jtype];
-
-        i_pairs[i_numpairs][0] = i;
-        i_pairs[i_numpairs][1] = jj;
-        i_pairs[i_numpairs][2] = -1;
-        i_pairs[i_numpairs][3] = count;
-        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          i_rij[count][ninside][0] = delx;
-          i_rij[count][ninside][1] = dely;
-          i_rij[count][ninside][2] = delz;
-          i_inside[count][ninside] = j;
-          i_wj[count][ninside] = wjelem[jelem];
-          i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-
-          // update index list with inside index
-          i_pairs[i_numpairs][2] = ninside++;
-        }
-        i_numpairs++;
-      }
-      i_ninside[count] = ninside;
-      count++;
-    }
-  }
-
-  for (int ii = 0; ii < ghostinum; ii++) {
-    int i = ghostilist[ii];
-    int jnum = ghostnumneigh[i];
-    int* jlist = ghostneighs+ghostfirstneigh[i];
-    const double xtmp = atom->x[i][0];
-    const double ytmp = atom->x[i][1];
-    const double ztmp = atom->x[i][2];
-    const int itype = atom->type[i];
-    const int ielem = map[itype];
-    const double radi = radelem[ielem];
-    int ninside = 0;
-
-    for (int jj = 0; jj < jnum; jj++) {
-      int j = jlist[jj];
-      j &= NEIGHMASK;
-      const double delx = atom->x[j][0] - xtmp;
-      const double dely = atom->x[j][1] - ytmp;
-      const double delz = atom->x[j][2] - ztmp;
-      const double rsq = delx*delx + dely*dely + delz*delz;
-      int jtype = atom->type[j];
-      int jelem = map[jtype];
-
-      i_pairs[i_numpairs][0] = i;
-      i_pairs[i_numpairs][1] = jj;
-      i_pairs[i_numpairs][2] = -1;
-      i_pairs[i_numpairs][3] = count;
-      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-        i_rij[count][ninside][0] = delx;
-        i_rij[count][ninside][1] = dely;
-        i_rij[count][ninside][2] = delz;
-        i_inside[count][ninside] = j;
-        i_wj[count][ninside] = wjelem[jelem];
-        i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-        // update index list with inside index
-        i_pairs[i_numpairs][2] = ninside++;
-      }
-      i_numpairs++;
-    }
-    i_ninside[count] = ninside;
-    count++;
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-#if defined(_OPENMP)
-#pragma omp parallel for shared(count) default(none)
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    //sna[tid]->compute_ui(i_ninside[ii]);
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-    sna[tid]->compute_ui_omp(i_ninside[ii],MAX(int(nthreads/count),1));
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&endtime);
-    sna[tid]->timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                          (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-  }
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = 0;//omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    sna[tid]->compute_zi_omp(MAX(int(nthreads/count),1));
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  sna[0]->timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                      (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-}
-
 /* ----------------------------------------------------------------------
    compute beta
 ------------------------------------------------------------------------- */
@@ -914,7 +261,6 @@ void PairSNAP::compute_bispectrum()
 
   double **x = atom->x;
   int *type = atom->type;
-  class SNA* snaptr = sna[0];
 
   for (int ii = 0; ii < list->inum; ii++) {
     i = list->ilist[ii];
@@ -991,96 +337,8 @@ void PairSNAP::allocate()
 
 void PairSNAP::settings(int narg, char **arg)
 {
-
-  // set default values for optional arguments
-
-  nthreads = -1;
-  use_shared_arrays=-1;
-  do_load_balance = 0;
-  use_optimized = 1;
-
-  // optional arguments
-
-  for (int i=0; i < narg; i++) {
-    if (i+2>narg) error->all(FLERR,"Illegal pair_style command");
-    if (strcmp(arg[i],"nthreads")==0) {
-      nthreads=force->inumeric(FLERR,arg[++i]);
-#if defined(LMP_USER_OMP)
-      error->all(FLERR,"Must set number of threads via package omp command");
-#else
-      omp_set_num_threads(nthreads);
-      comm->nthreads=nthreads;
-#endif
-      continue;
-    }
-    if (strcmp(arg[i],"optimized")==0) {
-      use_optimized=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"shared")==0) {
-      use_shared_arrays=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"loadbalance")==0) {
-      do_load_balance = force->inumeric(FLERR,arg[++i]);
-      if (do_load_balance) {
-        double mincutoff = extra_cutoff() +
-          rcutmax + neighbor->skin;
-        if (comm->cutghostuser < mincutoff) {
-          char buffer[255];
-
-          //apparently mincutoff is 0 after sprintf command ?????
-
-          double tmp = mincutoff + 0.1;
-          sprintf(buffer, "Communication cutoff is too small "
-                  "for SNAP micro load balancing, increased to %lf",
-                  mincutoff+0.1);
-          if (comm->me==0)
-            error->warning(FLERR,buffer);
-
-          comm->cutghostuser = tmp;
-
-        }
-      }
-      continue;
-    }
-    if (strcmp(arg[i],"schedule")==0) {
-      i++;
-      if (strcmp(arg[i],"static")==0)
-        schedule_user = 1;
-      if (strcmp(arg[i],"dynamic")==0)
-        schedule_user = 2;
-      if (strcmp(arg[i],"guided")==0)
-        schedule_user = 3;
-      if (strcmp(arg[i],"auto")==0)
-        schedule_user = 4;
-      if (strcmp(arg[i],"determine")==0)
-        schedule_user = 5;
-      if (schedule_user == 0)
-        error->all(FLERR,"Illegal pair_style command");
-      continue;
-    }
+  for (int i=0; i < narg; i++)
     error->all(FLERR,"Illegal pair_style command");
-  }
-
-  if (nthreads < 0)
-    nthreads = comm->nthreads;
-
-  if (use_shared_arrays < 0) {
-    if (nthreads > 1 && atom->nlocal <= 2*nthreads)
-      use_shared_arrays = 1;
-    else use_shared_arrays = 0;
-  }
-
-  // check if running non-optimized code with
-  // optimization flags set
-
-  if (!use_optimized)
-    if (nthreads > 1 ||
-        use_shared_arrays ||
-        do_load_balance ||
-        schedule_user)
-      error->all(FLERR,"Illegal pair_style command");
 }
 
 /* ----------------------------------------------------------------------
@@ -1170,26 +428,14 @@ void PairSNAP::coeff(int narg, char **arg)
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 
-  sna = new SNA*[nthreads];
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   diagonalstyle,
+                   rmin0,switchflag,bzeroflag);
+  snaptr->grow_rij(nmax);
 
-  // allocate memory for per OpenMP thread data which
-  // is wrapped into the sna class
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid] = new SNA(lmp,rfac0,twojmax,
-                       diagonalstyle,use_shared_arrays,
-                       rmin0,switchflag,bzeroflag);
-    if (!use_shared_arrays)
-      sna[tid]->grow_rij(nmax);
-  }
-
-  if (ncoeff != sna[0]->ncoeff) {
+  if (ncoeff != snaptr->ncoeff) {
     if (comm->me == 0)
-      printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
     error->all(FLERR,"Incorrect SNAP parameter file");
   }
 
@@ -1216,13 +462,7 @@ void PairSNAP::init_style()
   neighbor->requests[irequest]->half = 0;
   neighbor->requests[irequest]->full = 1;
 
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid]->init();
-  }
+  snaptr->init();
 
 }
 
@@ -1370,6 +610,8 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
     }
   }
 
+  if (comm->me == 0) fclose(fpcoeff);
+
   // set flags for required keywords
 
   rcutfacflag = 0;
@@ -1471,7 +713,6 @@ double PairSNAP::memory_usage()
   bytes += nmax*sizeof(int);
   bytes += (2*ncoeffall)*sizeof(double);
   bytes += (ncoeff*3)*sizeof(double);
-  bytes += sna[0]->memory_usage()*nthreads;
   return bytes;
 }
 
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index 1453076b23..b5871c1527 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -29,8 +29,6 @@ public:
   PairSNAP(class LAMMPS *);
   ~PairSNAP();
   virtual void compute(int, int);
-  void compute_regular(int, int);
-  void compute_optimized(int, int);
   void settings(int, char **);
   virtual void coeff(int, char **);
   virtual void init_style();
@@ -43,59 +41,16 @@ public:
 protected:
   int ncoeffq, ncoeffall;
   double **bvec, ***dbvec;
-  class SNA** sna;
+  class SNA* snaptr;
   int nmax;
-  int nthreads;
   virtual void allocate();
   void read_files(char *, char *);
   inline int equal(double* x,double* y);
   inline double dist2(double* x,double* y);
-  double extra_cutoff();
-  void load_balance();
-  void set_sna_to_shared(int snaid,int i);
-  void build_per_atom_arrays();
 
   void compute_beta();
   void compute_bispectrum();
 
-  int schedule_user;
-  double schedule_time_guided;
-  double schedule_time_dynamic;
-
-  int ncalls_neigh;
-  int do_load_balance;
-  int ilistmask_max;
-  int* ilistmask;
-  int ghostinum;
-  int ghostilist_max;
-  int* ghostilist;
-  int ghostnumneigh_max;
-  int* ghostnumneigh;
-  int* ghostneighs;
-  int* ghostfirstneigh;
-  int ghostneighs_total;
-  int ghostneighs_max;
-
-  int use_optimized;
-  int use_shared_arrays;
-
-  int i_max;
-  int i_neighmax;
-  int i_numpairs;
-  int **i_pairs;
-  double ***i_rij;
-  int **i_inside;
-  double **i_wj;
-  double **i_rcutij;
-  int *i_ninside;
-  double ****i_uarraytot_r, ****i_uarraytot_i;
-  double ******i_zarray_r, ******i_zarray_i;
-
-#ifdef TIMING_INFO
-  //  timespec starttime, endtime;
-  double timers[4];
-#endif
-
   double rcutmax;               // max cutoff for all elements
   int nelements;                // # of unique elements
   char **elements;              // names of unique elements
@@ -130,15 +85,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
-E: Must set number of threads via package omp command
-
-Because you are using the USER-OMP package, set the number of threads
-via its settings, not by the pair_style snap nthreads setting.
-
-W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
-
-Self-explanatory.
-
 E: Incorrect args for pair coefficients
 
 Self-explanatory.  Check the input script or data file.
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index b729e4d0d6..b388136caf 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -21,7 +21,6 @@
 #include "math_extra.h"
 #include <cstring>
 #include <cstdlib>
-#include "openmp_snap.h"
 
 #include "memory.h"
 #include "error.h"
@@ -114,12 +113,11 @@ using namespace MathConst;
 ------------------------------------------------------------------------- */
 
 SNA::SNA(LAMMPS* lmp, double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in, int diagonalstyle_in,
          double rmin0_in, int switch_flag_in, int bzero_flag_in) : Pointers(lmp)
 {
   wself = 1.0;
 
-  use_shared_arrays = use_shared_arrays_in;
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
   switch_flag = switch_flag_in;
@@ -141,7 +139,8 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
   wj = NULL;
   rcutij = NULL;
   nmax = 0;
-  idxj = NULL;
+  idxz = NULL;
+  idxb= NULL;
 
   if (bzero_flag) {
     double www = wself*wself*wself;
@@ -149,133 +148,178 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
       bzero[j] = www*(j+1);
   }
 
-#ifdef TIMING_INFO
-  timers = new double[20];
-  for(int i = 0; i < 20; i++) timers[i] = 0;
-  print = 0;
-  counter = 0;
-#endif
-
   build_indexlist();
-
-
 }
 
 /* ---------------------------------------------------------------------- */
 
 SNA::~SNA()
 {
-  if(!use_shared_arrays) {
-    destroy_twojmax_arrays();
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->destroy(bvec);
-    memory->destroy(dbvec);
-  }
-  delete[] idxj;
+  destroy_twojmax_arrays();
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->destroy(bvec);
+  memory->destroy(dbvec);
+  delete[] idxz;
+  delete[] idxb;
 }
 
 void SNA::build_indexlist()
 {
-  if(diagonalstyle == 0) {
-    int idxj_count = 0;
+  if(diagonalstyle != 3)
+    error->all(FLERR, "diagonal_style must be 3\n");
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          idxj_count++;
+  // index list for cglist
 
-    // indexList can be changed here
+  int jdim = twojmax + 1;
+  memory->create(idxcg_block, jdim, jdim, jdim,
+                 "sna:idxcg_block");
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxcg_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxcg_block[j1][j2][j] = idxcg_count; 
+        for (int m1 = 0; m1 <= j1; m1++)
+          for (int m2 = 0; m2 <= j2; m2++)
+            idxcg_count++;
+      }
+  idxcg_max = idxcg_count;
 
-    idxj_count = 0;
+  // index list for uarray
+  // need to include both halves
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          idxj[idxj_count].j1 = j1;
-          idxj[idxj_count].j2 = j2;
-          idxj[idxj_count].j = j;
-          idxj_count++;
+  memory->create(idxu_block, jdim,
+                 "sna:idxu_block");
+
+  int idxu_count = 0;
+  
+  for(int j = 0; j <= twojmax; j++) {
+    idxu_block[j] = idxu_count; 
+    for(int mb = 0; mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++)
+        idxu_count++;
+  }
+  idxu_max = idxu_count;
+
+  // index list for beta and B
+
+  int idxb_count = 0;  
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) idxb_count++;
+  
+  idxb_max = idxb_count;
+  idxb = new SNA_BINDICES[idxb_max];
+  
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) {
+          idxb[idxb_count].j1 = j1;
+          idxb[idxb_count].j2 = j2;
+          idxb[idxb_count].j = j;
+          idxb_count++;
         }
-  }
 
-  if(diagonalstyle == 1) {
-    int idxj_count = 0;
+  // reverse index list for beta and b
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj_count++;
+  memory->create(idxb_block, jdim, jdim, jdim,
+                 "sna:idxb_block");
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        if (j < j1) continue;
+        idxb_block[j1][j2][j] = idxb_count; 
+        idxb_count++;
       }
 
-    // indexList can be changed here
+  // index list for zlist
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxz_count = 0;
 
-    idxj_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++)
+            idxz_count++;
+  
+  idxz_max = idxz_count;
+  idxz = new SNA_ZINDICES[idxz_max];
+  
+  memory->create(idxz_block, jdim, jdim, jdim,
+                 "sna:idxz_block");
+  
+  idxz_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxz_block[j1][j2][j] = idxz_count; 
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj[idxj_count].j1 = j1;
-        idxj[idxj_count].j2 = j1;
-        idxj[idxj_count].j = j;
-        idxj_count++;
-      }
-  }
+        // find right beta[jjb] entry
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3
 
-  if(diagonalstyle == 2) {
-    int idxj_count = 0;
+        // CODE HORROR!! Need to figure this out later
+        double betaj = 1.0;
+// #ifdef USE_YDIRECT_ZLIST
+//         double betaj; 
+//         if (j >= j1) {
+//           const int jjb = idxb_block[j1][j2][j];
+//           if (j1 == j) {
+//             if (j2 == j) betaj = 3*beta[jjb];
+//             else betaj = 2*beta[jjb];
+//           } else betaj = beta[jjb]; 
+//         } else if (j >= j2) {
+//           const int jjb = idxb_block[j][j2][j1];
+//           if (j2 == j) betaj = 2*beta[jjb]*(j1+1)/(j+1.0);
+//           else betaj = beta[jjb]*(j1+1)/(j+1.0);
+//         } else {
+//           const int jjb = idxb_block[j2][j][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
+//         }
+// #else
+//         double betaj; 
+//         if (j >= j1) {
+//           const int jjb = idxb_block[j1][j2][j];
+//           betaj = beta[jjb]; 
+//         } else if (j >= j2) {
+//           const int jjb = idxb_block[j][j2][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0);
+//         } else {
+//           const int jjb = idxb_block[j2][j][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
+//         }
+// #endif
 
-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj_count++;
-    }
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++) {
+            idxz[idxz_count].j1 = j1;
+            idxz[idxz_count].j2 = j2;
+            idxz[idxz_count].j = j;
+            idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
+            idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2;
+            idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1;
+            idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
+            idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2;
+            idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;
 
-    // indexList can be changed here
+            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+            // find right beta[jjb] and y_list[jju] entries
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+            const int jju = idxu_block[j] + (j+1)*mb + ma;
+            idxz[idxz_count].jju = jju;
+            idxz[idxz_count].betaj = betaj;
 
-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj[idxj_count].j1 = j1;
-      idxj[idxj_count].j2 = j1;
-      idxj[idxj_count].j = j1;
-      idxj_count++;
-    }
-  }
-
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) idxj_count++;
-
-    // indexList can be changed here
-
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
-
-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            idxj[idxj_count].j1 = j1;
-            idxj[idxj_count].j2 = j2;
-            idxj[idxj_count].j = j;
-            idxj_count++;
+            idxz_count++;
           }
-  }
-
+      }
 }
 /* ---------------------------------------------------------------------- */
 
@@ -292,16 +336,14 @@ void SNA::grow_rij(int newnmax)
 
   nmax = newnmax;
 
-  if(!use_shared_arrays) {
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->create(rij, nmax, 3, "pair:rij");
-    memory->create(inside, nmax, "pair:inside");
-    memory->create(wj, nmax, "pair:wj");
-    memory->create(rcutij, nmax, "pair:rcutij");
- }
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->create(rij, nmax, 3, "pair:rij");
+  memory->create(inside, nmax, "pair:inside");
+  memory->create(wj, nmax, "pair:wj");
+  memory->create(rcutij, nmax, "pair:rcutij");
 }
 /* ----------------------------------------------------------------------
    compute Ui by summing over neighbors j
@@ -320,10 +362,6 @@ void SNA::compute_ui(int jnum)
   zero_uarraytot();
   addself_uarraytot(wself);
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
   for(int j = 0; j < jnum; j++) {
     x = rij[j][0];
     y = rij[j][1];
@@ -339,48 +377,6 @@ void SNA::compute_ui(int jnum)
     add_uarraytot(r, wj[j], rcutij[j]);
   }
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[0] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-void SNA::compute_ui_omp(int jnum, int sub_threads)
-{
-  double rsq, r, x, y, z, z0, theta0;
-
-  // utot(j,ma,mb) = 0 for all j,ma,ma
-  // utot(j,ma,ma) = 1 for all j,ma
-  // for j in neighbors of i:
-  //   compute r0 = (x,y,z,z0)
-  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
-
-  zero_uarraytot();
-  addself_uarraytot(wself);
-
-  for(int j = 0; j < jnum; j++) {
-    x = rij[j][0];
-    y = rij[j][1];
-    z = rij[j][2];
-    rsq = x * x + y * y + z * z;
-    r = sqrt(rsq);
-    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
-    //    theta0 = (r - rmin0) * rscale0;
-    z0 = r / tan(theta0);
-    omp_set_num_threads(sub_threads);
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(x,y,z,z0,r,sub_threads) default(none)
-#endif
-    {
-      compute_uarray_omp(x, y, z, z0, r, sub_threads);
-    }
-    add_uarraytot(r, wj[j], rcutij[j]);
-  }
-
-
 }
 
 /* ----------------------------------------------------------------------
@@ -389,24 +385,6 @@ void SNA::compute_ui_omp(int jnum, int sub_threads)
 
 void SNA::compute_zi()
 {
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,jmid
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
 
   // compute_dbidrj() requires full j1/j2/j chunk of z elements
   // use zarray j1/j2 symmetry
@@ -449,84 +427,13 @@ void SNA::compute_zi()
       } // end loop over j
     } // end loop over j1, j2
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-}
-
-void SNA::compute_zi_omp(int sub_threads)
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
-
-  if(omp_in_parallel())
-    omp_set_num_threads(sub_threads);
-
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
-
-#if defined(_OPENMP)
-#pragma omp parallel for schedule(auto) default(none)
-#endif
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-
-    double sumb1_r, sumb1_i;
-    int ma2, mb2;
-
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        zarray_r[j1][j2][j][ma][mb] = 0.0;
-        zarray_i[j1][j2][j][ma][mb] = 0.0;
-
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-          sumb1_r = 0.0;
-          sumb1_i = 0.0;
-
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
-              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
-
-            mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-            sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
-               uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
-            sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
-               uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
-          }
-
-          zarray_r[j1][j2][j][ma][mb] +=
-            sumb1_r * cgarray[j1][j2][j][ma1][ma2];
-          zarray_i[j1][j2][j][ma][mb] +=
-            sumb1_i * cgarray[j1][j2][j][ma1][ma2];
-        }
-      }
-  }
 }
 
 /* ----------------------------------------------------------------------
    compute Yi by summing over products of beta and Zi
 ------------------------------------------------------------------------- */
 
-void SNA::compute_yi(double* beta)
+void SNA::compute_yi(const double* beta)
 {
   int j;
   int idxz_count;
@@ -540,18 +447,18 @@ void SNA::compute_yi(double* beta)
       } // end loop over ma, mb
   } // end loop over j
 
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j3 = idxj[JJ].j;
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j3 = idxb[jjb].j;
 
     j = j3;
     jjjzarray_r = zarray_r[j1][j2][j3];
     jjjzarray_i = zarray_i[j1][j2][j3];
     for(int mb = 0; 2*mb <= j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb];
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb];
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb];
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb];
       } // end loop over ma, mb
 
     j = j1;
@@ -560,8 +467,8 @@ void SNA::compute_yi(double* beta)
     double j1fac = (j3+1)/(j+1.0);
     for(int mb = 0; 2*mb <= j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j1fac;
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j1fac;
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j1fac;
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j1fac;
       } // end loop over ma, mb
 
     j = j2;
@@ -570,8 +477,8 @@ void SNA::compute_yi(double* beta)
     double j2fac = (j3+1)/(j+1.0);
     for(int mb = 0; 2*mb <= j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j2fac;
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j2fac;
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j2fac;
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j2fac;
       } // end loop over ma, mb
 
   } // end loop over jjb
@@ -655,10 +562,6 @@ void SNA::compute_bi()
   //            b(j1,j2,j) +=
   //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++) {
       for(int j = abs(j1 - j2);
@@ -691,12 +594,6 @@ void SNA::compute_bi()
       }
     }
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[2] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }
 
 /* ----------------------------------------------------------------------
@@ -760,164 +657,8 @@ void SNA::compute_duidrj(double* rij, double wj, double rcut)
   z0 = r * cs / sn;
   dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
   compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-/* ----------------------------------------------------------------------
-   calculate derivative of Bi w.r.t. atom j
-   variant using indexlist for j1,j2,j
-   variant not using symmetry relation
-------------------------------------------------------------------------- */
-
-void SNA::compute_dbidrj_nonsymm()
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        dbdr(j1,j2,j) = 0
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            dzdr = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  (dudr(j1,ma1,mb1) * u(j2,ma2,mb2) +
-  //                  u(j1,ma1,mb1) * dudr(j2,ma2,mb2))
-  //              dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-  //            dbdr(j1,j2,j) +=
-  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-  //              Conj(u(j,ma,mb))*dzdr
-
-  double* dbdr;
-  double* dudr_r, *dudr_i;
-  double sumb1_r[3], sumb1_i[3], dzdr_r[3], dzdr_i[3];
-  int ma2;
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-    dbdr = dbarray[j1][j2][j];
-    dbdr[0] = 0.0;
-    dbdr[1] = 0.0;
-    dbdr[2] = 0.0;
-
-    double** *j1duarray_r = duarray_r[j1];
-    double** *j2duarray_r = duarray_r[j2];
-    double** *j1duarray_i = duarray_i[j1];
-    double** *j2duarray_i = duarray_i[j2];
-    double** j1uarraytot_r = uarraytot_r[j1];
-    double** j2uarraytot_r = uarraytot_r[j2];
-    double** j1uarraytot_i = uarraytot_i[j1];
-    double** j2uarraytot_i = uarraytot_i[j2];
-    double** j1j2jcgarray = cgarray[j1][j2][j];
-
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        dzdr_r[0] = 0.0;
-        dzdr_r[1] = 0.0;
-        dzdr_r[2] = 0.0;
-        dzdr_i[0] = 0.0;
-        dzdr_i[1] = 0.0;
-        dzdr_i[2] = 0.0;
-
-        const int max_mb1 = MIN(j1, (2 * mb - j + j2 + j1) / 2) + 1;
-        const int max_ma1 = MIN(j1, (2 * ma - j + j2 + j1) / 2) + 1;
-
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 < max_ma1; ma1++) {
-
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-          sumb1_r[0] = 0.0;
-          sumb1_r[1] = 0.0;
-          sumb1_r[2] = 0.0;
-          sumb1_i[0] = 0.0;
-          sumb1_i[1] = 0.0;
-          sumb1_i[2] = 0.0;
-
-          //inside loop 54 operations (mul and add)
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2),
-              mb2 = mb + (j1 + j2 - j) / 2 - mb1;
-              mb1 < max_mb1; mb1++, mb2--) {
-
-            double* dudr1_r, *dudr1_i, *dudr2_r, *dudr2_i;
-
-            dudr1_r = j1duarray_r[ma1][mb1];
-            dudr2_r = j2duarray_r[ma2][mb2];
-            dudr1_i = j1duarray_i[ma1][mb1];
-            dudr2_i = j2duarray_i[ma2][mb2];
-
-            const double cga_mb1mb2 = j1j2jcgarray[mb1][mb2];
-            const double uat_r_ma2mb2 = cga_mb1mb2 * j2uarraytot_r[ma2][mb2];
-            const double uat_r_ma1mb1 = cga_mb1mb2 * j1uarraytot_r[ma1][mb1];
-            const double uat_i_ma2mb2 = cga_mb1mb2 * j2uarraytot_i[ma2][mb2];
-            const double uat_i_ma1mb1 = cga_mb1mb2 * j1uarraytot_i[ma1][mb1];
-
-            for(int k = 0; k < 3; k++) {
-              sumb1_r[k] += dudr1_r[k] * uat_r_ma2mb2;
-              sumb1_r[k] -= dudr1_i[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_r[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_i[k] * uat_r_ma2mb2;
-
-              sumb1_r[k] += dudr2_r[k] * uat_r_ma1mb1;
-              sumb1_r[k] -= dudr2_i[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_r[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_i[k] * uat_r_ma1mb1;
-            }
-          } // end loop over mb1,mb2
-
-          // dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-
-          dzdr_r[0] += sumb1_r[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[1] += sumb1_r[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[2] += sumb1_r[2] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[0] += sumb1_i[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[1] += sumb1_i[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[2] += sumb1_i[2] * j1j2jcgarray[ma1][ma2];
-        } // end loop over ma1,ma2
-
-        // dbdr(j1,j2,j) +=
-        //   Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-        //   Conj(u(j,ma,mb))*dzdr
-
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-
-        for(int k = 0; k < 3; k++)
-          dbdr[k] +=
-            (dudr_r[k] * zarray_r[j1][j2][j][ma][mb] +
-             dudr_i[k] * zarray_i[j1][j2][j][ma][mb]) +
-            (uarraytot_r[j][ma][mb] * dzdr_r[k] +
-             uarraytot_i[j][ma][mb] * dzdr_i[k]);
-      } //end loop over ma mb
-
-  } //end loop over j1 j2 j
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }
 
 /* ----------------------------------------------------------------------
@@ -958,14 +699,10 @@ void SNA::compute_dbidrj()
   double jjjmambzarray_r;
   double jjjmambzarray_i;
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j = idxb[jjb].j;
 
     dbdr = dbarray[j1][j2][j];
     dbdr[0] = 0.0;
@@ -1149,12 +886,6 @@ void SNA::compute_dbidrj()
 
   } //end loop over j1 j2 j
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }
 
 /* ----------------------------------------------------------------------
@@ -1251,27 +982,6 @@ void SNA::add_uarraytot(double r, double wj, double rcut)
       }
 }
 
-void SNA::add_uarraytot_omp(double r, double wj, double rcut)
-{
-  double sfac;
-
-  sfac = compute_sfac(r, rcut);
-
-  sfac *= wj;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] +=
-          sfac * uarray_r[j][ma][mb];
-        uarraytot_i[j][ma][mb] +=
-          sfac * uarray_i[j][ma][mb];
-      }
-}
-
 /* ----------------------------------------------------------------------
    compute Wigner U-functions for one neighbor
 ------------------------------------------------------------------------- */
@@ -1348,88 +1058,6 @@ void SNA::compute_uarray(double x, double y, double z,
   }
 }
 
-void SNA::compute_uarray_omp(double x, double y, double z,
-                             double z0, double r, int /*sub_threads*/)
-{
-  double r0inv;
-  double a_r, b_r, a_i, b_i;
-  double rootpq;
-
-  // compute Cayley-Klein parameters for unit quaternion
-
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  a_r = r0inv * z0;
-  a_i = -r0inv * z;
-  b_r = r0inv * y;
-  b_i = -r0inv * x;
-
-  // VMK Section 4.8.2
-
-  uarray_r[0][0][0] = 1.0;
-  uarray_i[0][0][0] = 0.0;
-
-  for (int j = 1; j <= twojmax; j++) {
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int mb = 0; mb < j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
-
-      for (int ma = 0; ma < j; ma++) {
-        rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_r[j - 1][ma][mb] +
-           a_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_i[j - 1][ma][mb] -
-           a_i * uarray_r[j - 1][ma][mb]);
-
-        rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_r[j - 1][ma][mb] +
-           b_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_i[j - 1][ma][mb] -
-           b_i * uarray_r[j - 1][ma][mb]);
-      }
-    }
-
-    int mb = j;
-    uarray_r[j][0][mb] = 0.0;
-    uarray_i[j][0][mb] = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int ma = 0; ma < j; ma++) {
-      rootpq = rootpqarray[j - ma][mb];
-      uarray_r[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_r[j - 1][ma][mb - 1] -
-         b_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_i[j - 1][ma][mb - 1] +
-         b_i * uarray_r[j - 1][ma][mb - 1]);
-
-      rootpq = rootpqarray[ma + 1][mb];
-      uarray_r[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_r[j - 1][ma][mb - 1] -
-         a_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_i[j - 1][ma][mb - 1] +
-         a_i * uarray_r[j - 1][ma][mb - 1]);
-    }
-  }
-}
-
 /* ----------------------------------------------------------------------
    compute derivatives of Wigner U-functions for one neighbor
    see comments in compute_uarray()
@@ -1644,20 +1272,18 @@ void SNA::create_twojmax_arrays()
     bzero = NULL;
 
 
-  if(!use_shared_arrays) {
-    memory->create(uarraytot_r, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-    memory->create(uarraytot_i, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-    memory->create(yarray_r, jdim, jdim, jdim,
-                   "sna:yarray");
-    memory->create(yarray_i, jdim, jdim, jdim,
-                   "sna:yarray");
-  }
+  memory->create(uarraytot_r, jdim, jdim, jdim,
+                 "sna:uarraytot");
+  memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
+                 "sna:zarray");
+  memory->create(uarraytot_i, jdim, jdim, jdim,
+                 "sna:uarraytot");
+  memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
+                 "sna:zarray");
+  memory->create(yarray_r, jdim, jdim, jdim,
+                 "sna:yarray");
+  memory->create(yarray_i, jdim, jdim, jdim,
+                 "sna:yarray");
 
 }
 
@@ -1680,14 +1306,12 @@ void SNA::destroy_twojmax_arrays()
   if (bzero_flag)
     memory->destroy(bzero);
 
-  if(!use_shared_arrays) {
-    memory->destroy(uarraytot_r);
-    memory->destroy(zarray_r);
-    memory->destroy(uarraytot_i);
-    memory->destroy(zarray_i);
-    memory->destroy(yarray_r);
-    memory->destroy(yarray_i);
-  }
+  memory->destroy(uarraytot_r);
+  memory->destroy(zarray_r);
+  memory->destroy(uarraytot_i);
+  memory->destroy(zarray_i);
+  memory->destroy(yarray_r);
+  memory->destroy(yarray_i);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h
index 2c90da1d30..b93b0ac7b0 100644
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@@ -24,14 +24,19 @@
 
 namespace LAMMPS_NS {
 
-struct SNA_LOOPINDICES {
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
+  double betaj;
+};
+
+struct SNA_BINDICES {
   int j1, j2, j;
 };
 
 class SNA : protected Pointers {
 
 public:
-  SNA(LAMMPS*, double, int, int, int, double, int, int);
+  SNA(LAMMPS*, double, int, int, double, int, int);
 
   SNA(LAMMPS* lmp) : Pointers(lmp) {};
   ~SNA();
@@ -44,10 +49,8 @@ public:
   // functions for bispectrum coefficients
 
   void compute_ui(int);
-  void compute_ui_omp(int, int);
   void compute_zi();
-  void compute_zi_omp(int);
-  void compute_yi(double*);
+  void compute_yi(const double*);
   void compute_bi();
   void copy_bi2bvec();
 
@@ -56,20 +59,10 @@ public:
   void compute_duidrj(double*, double, double);
   void compute_dbidrj();
   void compute_deidrj(double*);
-  void compute_dbidrj_nonsymm();
   void copy_dbi2dbvec();
   double compute_sfac(double, double);
   double compute_dsfac(double, double);
 
-#ifdef TIMING_INFO
-  double* timers;
-  timespec starttime, endtime;
-  int print;
-  int counter;
-#endif
-
-  //per sna class instance for OMP use
-
   double* bvec, ** dbvec;
   double** rij;
   int* inside;
@@ -83,16 +76,17 @@ public:
   double*** uarraytot_r, *** uarraytot_i;
   double***** zarray_r, ***** zarray_i;
   double*** yarray_r, *** yarray_i;
-  double*** uarraytot_r_b, *** uarraytot_i_b;
-  double***** zarray_r_b, ***** zarray_i_b;
   double*** uarray_r, *** uarray_i;
 
 private:
   double rmin0, rfac0;
 
-  //use indexlist instead of loops, constructor generates these
-  SNA_LOOPINDICES* idxj;
-  int idxj_max;
+  // use indexlist instead of loops, constructor generates these
+
+  SNA_ZINDICES* idxz;
+  SNA_BINDICES* idxb;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
   // data for bispectrum coefficients
 
   double***** cgarray;
@@ -104,6 +98,21 @@ private:
   double**** duarray_r, **** duarray_i;
   double**** dbarray;
 
+  double* cglist;  
+  int*** idxcg_block; 
+
+  double* ulisttot_r, * ulisttot_i;
+  double* ulist_r, * ulist_i;
+  int* idxu_block;
+
+  double* zlist_r, * zlist_i;
+  int*** idxz_block;
+
+  int*** idxb_block;
+
+  double** dulist_r, ** dulist_i;
+  double* ylist_r, * ylist_i;
+
   static const int nmaxfactorial = 167;
   static const double nfac_table[];
   double factorial(int);
@@ -118,22 +127,13 @@ private:
   void zero_uarraytot();
   void addself_uarraytot(double);
   void add_uarraytot(double, double, double);
-  void add_uarraytot_omp(double, double, double);
   void compute_uarray(double, double, double,
                       double, double);
-  void compute_uarray_omp(double, double, double,
-                          double, double, int);
   double deltacg(int, int, int);
   int compute_ncoeff();
   void compute_duarray(double, double, double,
                        double, double, double, double, double);
 
-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
   // Sets the style for the switching function
   // 0 = none
   // 1 = cosine

From 0559e155f23330999239847396384115fc698311 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Tue, 11 Jun 2019 18:24:02 -0600
Subject: [PATCH 12/21] Implemented lists instead of multidim arrays

---
 src/SNAP/compute_sna_atom.cpp  |    7 +-
 src/SNAP/compute_snad_atom.cpp |   34 +-
 src/SNAP/compute_snav_atom.cpp |   46 +-
 src/SNAP/pair_snap.cpp         |    8 +-
 src/SNAP/sna.cpp               | 1160 ++++++++++++++------------------
 src/SNAP/sna.h                 |   25 +-
 6 files changed, 566 insertions(+), 714 deletions(-)

diff --git a/src/SNAP/compute_sna_atom.cpp b/src/SNAP/compute_sna_atom.cpp
index 17774143d5..fea37faca0 100644
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@@ -247,13 +247,12 @@ void ComputeSNAAtom::compute_peratom()
       snaptr->compute_ui(ninside);
       snaptr->compute_zi();
       snaptr->compute_bi();
-      snaptr->copy_bi2bvec();
       for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-        sna[i][icoeff] = snaptr->bvec[icoeff];
+        sna[i][icoeff] = snaptr->blist[icoeff];
       if (quadraticflag) {
         int ncount = ncoeff;
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bi = snaptr->bvec[icoeff];
+          double bi = snaptr->blist[icoeff];
 
           // diagonal element of quadratic matrix
 
@@ -262,7 +261,7 @@ void ComputeSNAAtom::compute_peratom()
           // upper-triangular elements of quadratic matrix
 
           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++)
-            sna[i][ncount++] = bi*snaptr->bvec[jcoeff];
+            sna[i][ncount++] = bi*snaptr->blist[jcoeff];
         }
       }
     } else {
diff --git a/src/SNAP/compute_snad_atom.cpp b/src/SNAP/compute_snad_atom.cpp
index b356d61d3d..156380eccc 100644
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@@ -266,7 +266,6 @@ void ComputeSNADAtom::compute_peratom()
       snaptr->compute_zi();
       if (quadraticflag) {
         snaptr->compute_bi();
-        snaptr->copy_bi2bvec();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
@@ -275,7 +274,6 @@ void ComputeSNADAtom::compute_peratom()
                                     snaptr->wj[jj],
                                     snaptr->rcutij[jj]);
         snaptr->compute_dbidrj();
-        snaptr->copy_dbi2dbvec();
 
         // Accumulate -dBi/dRi, -dBi/dRj
 
@@ -283,12 +281,12 @@ void ComputeSNADAtom::compute_peratom()
         double *snadj = snad[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snadi[icoeff] += snaptr->dbvec[icoeff][0];
-          snadi[icoeff+yoffset] += snaptr->dbvec[icoeff][1];
-          snadi[icoeff+zoffset] += snaptr->dbvec[icoeff][2];
-          snadj[icoeff] -= snaptr->dbvec[icoeff][0];
-          snadj[icoeff+yoffset] -= snaptr->dbvec[icoeff][1];
-          snadj[icoeff+zoffset] -= snaptr->dbvec[icoeff][2];
+          snadi[icoeff] += snaptr->dblist[icoeff][0];
+          snadi[icoeff+yoffset] += snaptr->dblist[icoeff][1];
+          snadi[icoeff+zoffset] += snaptr->dblist[icoeff][2];
+          snadj[icoeff] -= snaptr->dblist[icoeff][0];
+          snadj[icoeff+yoffset] -= snaptr->dblist[icoeff][1];
+          snadj[icoeff+zoffset] -= snaptr->dblist[icoeff][2];
         }
 
         if (quadraticflag) {
@@ -297,10 +295,10 @@ void ComputeSNADAtom::compute_peratom()
           snadj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr->bvec[icoeff];
-            double bix = snaptr->dbvec[icoeff][0];
-            double biy = snaptr->dbvec[icoeff][1];
-            double biz = snaptr->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];
 
             // diagonal elements of quadratic matrix
 
@@ -319,12 +317,12 @@ void ComputeSNADAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
-                + bix*snaptr->bvec[jcoeff];
-              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
-                + biy*snaptr->bvec[jcoeff];
-              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
-                + biz*snaptr->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];
 
               snadi[ncount] +=         dbxtmp;
               snadi[ncount+yoffset] += dbytmp;
diff --git a/src/SNAP/compute_snav_atom.cpp b/src/SNAP/compute_snav_atom.cpp
index 9f9ef7a67d..6caff0820c 100644
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@@ -260,7 +260,6 @@ void ComputeSNAVAtom::compute_peratom()
       snaptr->compute_zi();
       if (quadraticflag) {
         snaptr->compute_bi();
-        snaptr->copy_bi2bvec();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
@@ -270,7 +269,6 @@ void ComputeSNAVAtom::compute_peratom()
                                     snaptr->wj[jj],
                                     snaptr->rcutij[jj]);
         snaptr->compute_dbidrj();
-        snaptr->copy_dbi2dbvec();
 
         // Accumulate -dBi/dRi*Ri, -dBi/dRj*Rj
 
@@ -278,18 +276,18 @@ void ComputeSNAVAtom::compute_peratom()
         double *snavj = snav[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snavi[icoeff]           += snaptr->dbvec[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr->dbvec[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr->dbvec[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr->dbvec[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr->dbvec[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr->dbvec[icoeff][0]*ytmp;
-          snavj[icoeff]           -= snaptr->dbvec[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr->dbvec[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr->dbvec[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr->dbvec[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][1];
+          snavi[icoeff]           += snaptr->dblist[icoeff][0]*xtmp;
+          snavi[icoeff+nperdim]   += snaptr->dblist[icoeff][1]*ytmp;
+          snavi[icoeff+2*nperdim] += snaptr->dblist[icoeff][2]*ztmp;
+          snavi[icoeff+3*nperdim] += snaptr->dblist[icoeff][1]*ztmp;
+          snavi[icoeff+4*nperdim] += snaptr->dblist[icoeff][0]*ztmp;
+          snavi[icoeff+5*nperdim] += snaptr->dblist[icoeff][0]*ytmp;
+          snavj[icoeff]           -= snaptr->dblist[icoeff][0]*x[j][0];
+          snavj[icoeff+nperdim]   -= snaptr->dblist[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nperdim] -= snaptr->dblist[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nperdim] -= snaptr->dblist[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nperdim] -= snaptr->dblist[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nperdim] -= snaptr->dblist[icoeff][0]*x[j][1];
         }
 
         if (quadraticflag) {
@@ -298,10 +296,10 @@ void ComputeSNAVAtom::compute_peratom()
           snavj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr->bvec[icoeff];
-            double bix = snaptr->dbvec[icoeff][0];
-            double biy = snaptr->dbvec[icoeff][1];
-            double biz = snaptr->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];
 
             // diagonal element of quadratic matrix
 
@@ -325,12 +323,12 @@ void ComputeSNAVAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
-                + bix*snaptr->bvec[jcoeff];
-              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
-                + biy*snaptr->bvec[jcoeff];
-              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
-                + biz*snaptr->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];
               snavi[ncount] +=           dbxtmp*xtmp;
               snavi[ncount+nperdim] +=   dbytmp*ytmp;
               snavi[ncount+2*nperdim] += dbztmp*ztmp;
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index ae542e81b4..6a65f872fd 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -166,17 +166,14 @@ void PairSNAP::compute(int eflag, int vflag)
       }
     }
 
-    // compute Ui, Zi, and Bi for atom I
+    // compute Ui, Yi for atom I
 
     snaptr->compute_ui(ninside);
-    snaptr->compute_zi();
 
     // for neighbors of I within cutoff:
     // compute Fij = dEi/dRj = -dEi/dRi 
     // add to Fi, subtract from Fj
 
-    // compute beta_i*Z_i = Y_i
-
     snaptr->compute_yi(beta[ii]);
 
     for (int jj = 0; jj < ninside; jj++) {
@@ -310,10 +307,9 @@ void PairSNAP::compute_bispectrum()
     snaptr->compute_ui(ninside);
     snaptr->compute_zi();
     snaptr->compute_bi();
-    snaptr->copy_bi2bvec();
 
     for (int k = 0; k < ncoeff; k++)
-      bispectrum[ii][k] = snaptr->bvec[k];
+      bispectrum[ii][k] = snaptr->blist[k];
   }
 }
 
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index b388136caf..131ac48fdb 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -128,8 +128,6 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
 
   ncoeff = compute_ncoeff();
 
-  create_twojmax_arrays();
-
   bvec = NULL;
   dbvec = NULL;
   memory->create(bvec, ncoeff, "pair:bvec");
@@ -142,20 +140,21 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
   idxz = NULL;
   idxb= NULL;
 
+  build_indexlist();
+  create_twojmax_arrays();
+
   if (bzero_flag) {
     double www = wself*wself*wself;
     for(int j = 0; j <= twojmax; j++)
       bzero[j] = www*(j+1);
   }
 
-  build_indexlist();
 }
 
 /* ---------------------------------------------------------------------- */
 
 SNA::~SNA()
 {
-  destroy_twojmax_arrays();
   memory->destroy(rij);
   memory->destroy(inside);
   memory->destroy(wj);
@@ -164,6 +163,7 @@ SNA::~SNA()
   memory->destroy(dbvec);
   delete[] idxz;
   delete[] idxb;
+  destroy_twojmax_arrays();
 }
 
 void SNA::build_indexlist()
@@ -234,9 +234,10 @@ void SNA::build_indexlist()
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
       for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-        if (j < j1) continue;
-        idxb_block[j1][j2][j] = idxb_count; 
-        idxb_count++;
+        if (j >= j1) {
+          idxb_block[j1][j2][j] = idxb_count; 
+          idxb_count++;
+        }
       }
 
   // index list for zlist
@@ -266,38 +267,6 @@ void SNA::build_indexlist()
         // multiply and divide by j+1 factors
         // account for multiplicity of 1, 2, or 3
 
-        // CODE HORROR!! Need to figure this out later
-        double betaj = 1.0;
-// #ifdef USE_YDIRECT_ZLIST
-//         double betaj; 
-//         if (j >= j1) {
-//           const int jjb = idxb_block[j1][j2][j];
-//           if (j1 == j) {
-//             if (j2 == j) betaj = 3*beta[jjb];
-//             else betaj = 2*beta[jjb];
-//           } else betaj = beta[jjb]; 
-//         } else if (j >= j2) {
-//           const int jjb = idxb_block[j][j2][j1];
-//           if (j2 == j) betaj = 2*beta[jjb]*(j1+1)/(j+1.0);
-//           else betaj = beta[jjb]*(j1+1)/(j+1.0);
-//         } else {
-//           const int jjb = idxb_block[j2][j][j1];
-//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
-//         }
-// #else
-//         double betaj; 
-//         if (j >= j1) {
-//           const int jjb = idxb_block[j1][j2][j];
-//           betaj = beta[jjb]; 
-//         } else if (j >= j2) {
-//           const int jjb = idxb_block[j][j2][j1];
-//           betaj = beta[jjb]*(j1+1)/(j+1.0);
-//         } else {
-//           const int jjb = idxb_block[j2][j][j1];
-//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
-//         }
-// #endif
-
         for (int mb = 0; 2*mb <= j; mb++)
           for (int ma = 0; ma <= j; ma++) {
             idxz[idxz_count].j1 = j1;
@@ -311,11 +280,9 @@ void SNA::build_indexlist()
             idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;
 
             // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
-            // find right beta[jjb] and y_list[jju] entries
 
             const int jju = idxu_block[j] + (j+1)*mb + ma;
             idxz[idxz_count].jju = jju;
-            idxz[idxz_count].betaj = betaj;
 
             idxz_count++;
           }
@@ -386,105 +353,168 @@ void SNA::compute_ui(int jnum)
 void SNA::compute_zi()
 {
 
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
+  int ma2, mb2;
+  for(int jjz = 0; jjz < idxz_max; jjz++) {
+    const int j1 = idxz[jjz].j1;
+    const int j2 = idxz[jjz].j2;
+    const int j = idxz[jjz].j;
+    const int ma1min = idxz[jjz].ma1min;
+    const int ma2max = idxz[jjz].ma2max;
+    const int na = idxz[jjz].na;
+    const int mb1min = idxz[jjz].mb1min;
+    const int mb2max = idxz[jjz].mb2max;
+    const int nb = idxz[jjz].nb;
 
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++) {
-      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
-        double sumb1_r, sumb1_i;
-        int ma2, mb2;
-        for(int mb = 0; 2*mb <= j; mb++)
-          for(int ma = 0; ma <= j; ma++) {
-            zarray_r[j1][j2][j][ma][mb] = 0.0;
-            zarray_i[j1][j2][j][ma][mb] = 0.0;
+    const double* cgblock = cglist + idxcg_block[j1][j2][j];
 
-            for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-                ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-              sumb1_r = 0.0;
-              sumb1_i = 0.0;
+    zlist_r[jjz] = 0.0;
+    zlist_i[jjz] = 0.0;
 
-              ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
+    int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+    int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+    int icgb = mb1min*(j2+1) + mb2max;
+    for(int ib = 0; ib < nb; ib++) {
 
-              for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
-              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
+      double suma1_r = 0.0;
+      double suma1_i = 0.0;
 
-                mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-                sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
-                  (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
-                   uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
-                sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
-                  (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
-                   uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
-              } // end loop over mb1
+      const double* u1_r = &ulisttot_r[jju1];
+      const double* u1_i = &ulisttot_i[jju1];
+      const double* u2_r = &ulisttot_r[jju2];
+      const double* u2_i = &ulisttot_i[jju2];
 
-              zarray_r[j1][j2][j][ma][mb] +=
-                sumb1_r * cgarray[j1][j2][j][ma1][ma2];
-              zarray_i[j1][j2][j][ma][mb] +=
-                sumb1_i * cgarray[j1][j2][j][ma1][ma2];
-            } // end loop over ma1
-          } // end loop over ma, mb
-      } // end loop over j
-    } // end loop over j1, j2
+      int ma1 = ma1min;
+      int ma2 = ma2max;
+      int icga = ma1min*(j2+1) + ma2max;
+      for(int ia = 0; ia < na; ia++) {
+        suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
+        suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
+        ma1++;
+        ma2--;
+        icga += j2;
+      } // end loop over ia
 
+      zlist_r[jjz] += cgblock[icgb] * suma1_r;
+      zlist_i[jjz] += cgblock[icgb] * suma1_i;
+      jju1 += j1+1;
+      jju2 -= j2+1;
+      icgb += j2;
+    } // end loop over ib
+
+//     // apply symmetry factor
+ 
+//     const double jfac = 1.0/(j+1);
+//     zlist_r[jjz] *= jfac;
+//     zlist_i[jjz] *= jfac;
+    
+  } // end loop over jjz
 }
 
 /* ----------------------------------------------------------------------
-   compute Yi by summing over products of beta and Zi
+   compute Yi from Ui without storing Zi, looping over ylist
 ------------------------------------------------------------------------- */
 
 void SNA::compute_yi(const double* beta)
 {
-  int j;
-  int idxz_count;
-  double **jjjzarray_r, **jjjzarray_i;
-
   for(int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
     for(int mb = 0; 2*mb <= j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] = 0.0;
-        yarray_i[j][ma][mb] = 0.0;
+        ylist_r[jju] = 0.0;
+        ylist_i[jju] = 0.0;
+        jju++;
       } // end loop over ma, mb
   } // end loop over j
 
   for(int jjb = 0; jjb < idxb_max; jjb++) {
-    const int j1 = idxb[jjb].j1;
-    const int j2 = idxb[jjb].j2;
-    const int j3 = idxb[jjb].j;
+    const int j1b = idxb[jjb].j1;
+    const int j2b = idxb[jjb].j2;
+    const int j3b = idxb[jjb].j;
 
-    j = j3;
-    jjjzarray_r = zarray_r[j1][j2][j3];
-    jjjzarray_i = zarray_i[j1][j2][j3];
-    for(int mb = 0; 2*mb <= j; mb++)
-      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb];
-        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb];
-      } // end loop over ma, mb
-
-    j = j1;
-    jjjzarray_r = zarray_r[j3][j2][j1];
-    jjjzarray_i = zarray_i[j3][j2][j1];
-    double j1fac = (j3+1)/(j+1.0);
-    for(int mb = 0; 2*mb <= j; mb++)
-      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j1fac;
-        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j1fac;
-      } // end loop over ma, mb
-
-    j = j2;
-    jjjzarray_r = zarray_r[j3][j1][j2];
-    jjjzarray_i = zarray_i[j3][j1][j2];
-    double j2fac = (j3+1)/(j+1.0);
-    for(int mb = 0; 2*mb <= j; mb++)
-      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j2fac;
-        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j2fac;
-      } // end loop over ma, mb
+    compute_yterm(j1b,j2b,j3b,beta);
+    compute_yterm(j3b,j2b,j1b,beta);
+    compute_yterm(j3b,j1b,j2b,beta);
 
   } // end loop over jjb
 
 }
 
+void SNA::compute_yterm(int j1, int j2, int j, const double* beta) {
+  double betaj;
+
+  int jju = idxu_block[j];
+  int jjz = idxz_block[j1][j2][j];
+
+  // pick out right beta value
+
+  if (j >= j1) {
+    const int jjb = idxb_block[j1][j2][j];
+    betaj = beta[jjb];
+  } else if (j >= j2) {
+    const int jjb = idxb_block[j][j2][j1];
+    betaj = beta[jjb]*(j1+1)/(j+1.0);
+  } else {
+    const int jjb = idxb_block[j2][j][j1];
+    betaj = beta[jjb]*(j1+1)/(j+1.0);
+  }
+
+  // can replace this with a single loop over jjz
+
+  for (int mb = 0; 2*mb <= j; mb++)
+    for (int ma = 0; ma <= j; ma++) {
+
+      const int ma1min = idxz[jjz].ma1min;
+      const int ma2max = idxz[jjz].ma2max;
+      const int na = idxz[jjz].na;
+      const int mb1min = idxz[jjz].mb1min;
+      const int mb2max = idxz[jjz].mb2max;
+      const int nb = idxz[jjz].nb;
+      
+      const double* cgblock = cglist + idxcg_block[j1][j2][j];
+
+      double ztmp_r = 0.0;
+      double ztmp_i = 0.0;
+
+      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+      int icgb = mb1min*(j2+1) + mb2max;
+      for(int ib = 0; ib < nb; ib++) {
+
+        double suma1_r = 0.0;
+        double suma1_i = 0.0;
+
+        const double* u1_r = &ulisttot_r[jju1];
+        const double* u1_i = &ulisttot_i[jju1];
+        const double* u2_r = &ulisttot_r[jju2];
+        const double* u2_i = &ulisttot_i[jju2];
+
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min*(j2+1) + ma2max;
+          
+        for(int ia = 0; ia < na; ia++) {
+          suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
+          suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
+
+        ztmp_r += cgblock[icgb] * suma1_r;
+        ztmp_i += cgblock[icgb] * suma1_i;
+        jju1 += j1+1;
+        jju2 -= j2+1;
+        icgb += j2;
+      } // end loop over ib
+
+      // printf("jju betaj ztmp ylist %d %g %g %d %d %d %d %d\n",jju,betaj,ztmp_r,j1,j2,j,ma,mb);
+      ylist_r[jju] += betaj*ztmp_r;
+      ylist_i[jju] += betaj*ztmp_i;
+      jjz++;
+      jju++;
+    } // end loop over ma, mb
+}
+
 /* ----------------------------------------------------------------------
    compute dEidRj
 ------------------------------------------------------------------------- */
@@ -496,19 +526,21 @@ void SNA::compute_deidrj(double* dedr)
     dedr[k] = 0.0;
 
   for(int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
 
     for(int mb = 0; 2*mb < j; mb++)
       for(int ma = 0; ma <= j; ma++) {
 
-        double* dudr_r = duarray_r[j][ma][mb];
-        double* dudr_i = duarray_i[j][ma][mb];
-        double jjjmambyarray_r = yarray_r[j][ma][mb];
-        double jjjmambyarray_i = yarray_i[j][ma][mb];
+        double* dudr_r = dulist_r[jju];
+        double* dudr_i = dulist_i[jju];
+        double jjjmambyarray_r = ylist_r[jju];
+        double jjjmambyarray_i = ylist_i[jju];
+
         for(int k = 0; k < 3; k++)
           dedr[k] +=
             dudr_r[k] * jjjmambyarray_r +
             dudr_i[k] * jjjmambyarray_i;
-
+        jju++;
       } //end loop over ma mb
 
     // For j even, handle middle column
@@ -517,30 +549,33 @@ void SNA::compute_deidrj(double* dedr)
 
       int mb = j/2;
       for(int ma = 0; ma < mb; ma++) {
-        double* dudr_r = duarray_r[j][ma][mb];
-        double* dudr_i = duarray_i[j][ma][mb];
-        double jjjmambyarray_r = yarray_r[j][ma][mb];
-        double jjjmambyarray_i = yarray_i[j][ma][mb];
+        double* dudr_r = dulist_r[jju];
+        double* dudr_i = dulist_i[jju];
+        double jjjmambyarray_r = ylist_r[jju];
+        double jjjmambyarray_i = ylist_i[jju];
+
         for(int k = 0; k < 3; k++)
           dedr[k] +=
             dudr_r[k] * jjjmambyarray_r +
             dudr_i[k] * jjjmambyarray_i;
-
+        jju++;
       }
 
       int ma = mb;
-      double* dudr_r = duarray_r[j][ma][mb];
-      double* dudr_i = duarray_i[j][ma][mb];
-      double jjjmambyarray_r = yarray_r[j][ma][mb];
-      double jjjmambyarray_i = yarray_i[j][ma][mb];
+      double* dudr_r = dulist_r[jju];
+      double* dudr_i = dulist_i[jju];
+      double jjjmambyarray_r = ylist_r[jju];
+      double jjjmambyarray_i = ylist_i[jju];
+
       for(int k = 0; k < 3; k++)
         dedr[k] += 
           (dudr_r[k] * jjjmambyarray_r +
            dudr_i[k] * jjjmambyarray_i)*0.5;
-        
+      jju++;
+
     } // end if jeven
 
-  } // End loop over j
+  } // end loop over j
 
   for(int k = 0; k < 3; k++)
     dedr[k] *= 2.0;
@@ -562,103 +597,41 @@ void SNA::compute_bi()
   //            b(j1,j2,j) +=
   //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
 
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++) {
-      for(int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        barray[j1][j2][j] = 0.0;
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j = idxb[jjb].j;
 
-        for(int mb = 0; 2*mb < j; mb++)
-          for(int ma = 0; ma <= j; ma++)
-            barray[j1][j2][j] +=
-              uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-              uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb];
+    int jjz = idxz_block[j1][j2][j];
+    int jju = idxu_block[j];
+    double sumzu = 0.0;
+    for (int mb = 0; 2*mb < j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        sumzu += ulisttot_r[jju]*zlist_r[jjz] + 
+          ulisttot_i[jju]*zlist_i[jjz];
+        jjz++;
+        jju++;
+      } // end loop over ma, mb                                                                                                   
 
-        // For j even, special treatment for middle column
+    // For j even, handle middle column
 
-        if (j%2 == 0) {
-          int mb = j/2;
-          for(int ma = 0; ma < mb; ma++)
-            barray[j1][j2][j] +=
-              uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-              uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb];
-          int ma = mb;
-          barray[j1][j2][j] +=
-            (uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-             uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb])*0.5;
-        }
-
-        barray[j1][j2][j] *= 2.0;
-        if (bzero_flag)
-          barray[j1][j2][j] -= bzero[j];
+    if (j%2 == 0) {
+      int mb = j/2;
+      for(int ma = 0; ma < mb; ma++) {
+        sumzu += ulisttot_r[jju]*zlist_r[jjz] + 
+          ulisttot_i[jju]*zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-    }
 
-}
-
-/* ----------------------------------------------------------------------
-   copy Bi array to a vector
-------------------------------------------------------------------------- */
-
-void SNA::copy_bi2bvec()
-{
-  int ncount, j1, j2, j;
-
-  ncount = 0;
-
-  for(j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2) {
-          bvec[ncount] = barray[j1][j2][j];
-          ncount++;
-        }
-    } else if(diagonalstyle == 1) {
-      j2 = j1;
-      for(j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        bvec[ncount] = barray[j1][j2][j];
-        ncount++;
-      }
-    } else if(diagonalstyle == 2) {
-      j = j2 = j1;
-      bvec[ncount] = barray[j1][j2][j];
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            bvec[ncount] = barray[j1][j2][j];
-            ncount++;
-          }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   calculate derivative of Ui w.r.t. atom j
-------------------------------------------------------------------------- */
-
-void SNA::compute_duidrj(double* rij, double wj, double rcut)
-{
-  double rsq, r, x, y, z, z0, theta0, cs, sn;
-  double dz0dr;
-
-  x = rij[0];
-  y = rij[1];
-  z = rij[2];
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
-  double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
-  theta0 = (r - rmin0) * rscale0;
-  cs = cos(theta0);
-  sn = sin(theta0);
-  z0 = r * cs / sn;
-  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
-
-  compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);
+      sumzu += 0.5*(ulisttot_r[jju]*zlist_r[jjz] + 
+                   ulisttot_i[jju]*zlist_i[jjz]);
+      jjz++;
+      jju++;
+    } // end if jeven
 
+    blist[jjb] = 2.0*sumzu;
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -698,44 +671,36 @@ void SNA::compute_dbidrj()
   double** jjjzarray_i;
   double jjjmambzarray_r;
   double jjjmambzarray_i;
+  int jjz, jju;
 
   for(int jjb = 0; jjb < idxb_max; jjb++) {
     const int j1 = idxb[jjb].j1;
     const int j2 = idxb[jjb].j2;
     const int j = idxb[jjb].j;
 
-    dbdr = dbarray[j1][j2][j];
+    dbdr = dblist[jjb];
     dbdr[0] = 0.0;
     dbdr[1] = 0.0;
     dbdr[2] = 0.0;
 
     // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
 
+    jjz = idxz_block[j1][j2][j];
+    jju = idxu_block[j];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j1 >= j2) {
-      jjjzarray_r = zarray_r[j1][j2][j];
-      jjjzarray_i = zarray_i[j1][j2][j];
-    } else {
-      jjjzarray_r = zarray_r[j2][j1][j];
-      jjjzarray_i = zarray_i[j2][j1][j];
-    }
-
     for(int mb = 0; 2*mb < j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-        jjjmambzarray_r = jjjzarray_r[ma][mb];
-        jjjmambzarray_i = jjjzarray_i[ma][mb];
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       } //end loop over ma mb
 
     // For j even, handle middle column
@@ -743,24 +708,24 @@ void SNA::compute_dbidrj()
     if (j%2 == 0) {
       int mb = j/2;
       for(int ma = 0; ma < mb; ma++) {
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-        jjjmambzarray_r = jjjzarray_r[ma][mb];
-        jjjmambzarray_i = jjjzarray_i[ma][mb];
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
       int ma = mb;
-      dudr_r = duarray_r[j][ma][mb];
-      dudr_i = duarray_i[j][ma][mb];
-      jjjmambzarray_r = jjjzarray_r[ma][mb];
-      jjjmambzarray_i = jjjzarray_i[ma][mb];
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if jeven
 
     for(int k = 0; k < 3; k++)
@@ -770,115 +735,97 @@ void SNA::compute_dbidrj()
 
     double j1fac = (j+1)/(j1+1.0);
 
+    jjz = idxz_block[j][j2][j1];
+    jju = idxu_block[j1];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j >= j2) {
-      jjjzarray_r = zarray_r[j][j2][j1];
-      jjjzarray_i = zarray_i[j][j2][j1];
-    } else {
-      jjjzarray_r = zarray_r[j2][j][j1];
-      jjjzarray_i = zarray_i[j2][j][j1];
-    }
-
-    for(int mb1 = 0; 2*mb1 < j1; mb1++)
-      for(int ma1 = 0; ma1 <= j1; ma1++) {
-
-        dudr_r = duarray_r[j1][ma1][mb1];
-        dudr_i = duarray_i[j1][ma1][mb1];
-        jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-        jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+    for(int mb = 0; 2*mb < j1; mb++)
+      for(int ma = 0; ma <= j1; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
-      } //end loop over ma1 mb1
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
+      } //end loop over ma mb
 
     // For j1 even, handle middle column
 
     if (j1%2 == 0) {
-      int mb1 = j1/2;
-      for(int ma1 = 0; ma1 < mb1; ma1++) {
-        dudr_r = duarray_r[j1][ma1][mb1];
-        dudr_i = duarray_i[j1][ma1][mb1];
-        jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-        jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+      int mb = j1/2;
+      for(int ma = 0; ma < mb; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-      int ma1 = mb1;
-      dudr_r = duarray_r[j1][ma1][mb1];
-      dudr_i = duarray_i[j1][ma1][mb1];
-      jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-      jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+      int ma = mb;
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if j1even
 
     for(int k = 0; k < 3; k++)
       dbdr[k] += 2.0*sumzdu_r[k]*j1fac;
 
-    // Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
+    // Sum over Conj(dudr(j2,ma2,mb2))*z(j,j1,j2,ma2,mb2)
 
     double j2fac = (j+1)/(j2+1.0);
 
+    jjz = idxz_block[j][j1][j2];
+    jju = idxu_block[j2];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j1 >= j) {
-      jjjzarray_r = zarray_r[j1][j][j2];
-      jjjzarray_i = zarray_i[j1][j][j2];
-    } else {
-      jjjzarray_r = zarray_r[j][j1][j2];
-      jjjzarray_i = zarray_i[j][j1][j2];
-    }
-
-    for(int mb2 = 0; 2*mb2 < j2; mb2++)
-      for(int ma2 = 0; ma2 <= j2; ma2++) {
-
-        dudr_r = duarray_r[j2][ma2][mb2];
-        dudr_i = duarray_i[j2][ma2][mb2];
-        jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-        jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+    for(int mb = 0; 2*mb < j2; mb++)
+      for(int ma = 0; ma <= j2; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
-      } //end loop over ma2 mb2
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
+      } //end loop over ma mb
 
     // For j2 even, handle middle column
 
     if (j2%2 == 0) {
-      int mb2 = j2/2;
-      for(int ma2 = 0; ma2 < mb2; ma2++) {
-        dudr_r = duarray_r[j2][ma2][mb2];
-        dudr_i = duarray_i[j2][ma2][mb2];
-        jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-        jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+      int mb = j2/2;
+      for(int ma = 0; ma < mb; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-      int ma2 = mb2;
-      dudr_r = duarray_r[j2][ma2][mb2];
-      dudr_i = duarray_i[j2][ma2][mb2];
-      jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-      jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+      int ma = mb;
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if j2even
 
     for(int k = 0; k < 3; k++)
@@ -889,75 +836,56 @@ void SNA::compute_dbidrj()
 }
 
 /* ----------------------------------------------------------------------
-   copy Bi derivatives into a vector
+   calculate derivative of Ui w.r.t. atom j
 ------------------------------------------------------------------------- */
 
-void SNA::copy_dbi2dbvec()
+void SNA::compute_duidrj(double* rij, double wj, double rcut)
 {
-  int ncount, j1, j2, j;
+  double rsq, r, x, y, z, z0, theta0, cs, sn;
+  double dz0dr;
 
-  ncount = 0;
+  x = rij[0];
+  y = rij[1];
+  z = rij[2];
+  rsq = x * x + y * y + z * z;
+  r = sqrt(rsq);
+  double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
+  theta0 = (r - rmin0) * rscale0;
+  cs = cos(theta0);
+  sn = sin(theta0);
+  z0 = r * cs / sn;
+  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  for(j1 = 0; j1 <= twojmax; j1++) {
-    if(diagonalstyle == 0) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2) {
-          dbvec[ncount][0] = dbarray[j1][j2][j][0];
-          dbvec[ncount][1] = dbarray[j1][j2][j][1];
-          dbvec[ncount][2] = dbarray[j1][j2][j][2];
-          ncount++;
-        }
-    } else if(diagonalstyle == 1) {
-      j2 = j1;
-      for(j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        dbvec[ncount][0] = dbarray[j1][j2][j][0];
-        dbvec[ncount][1] = dbarray[j1][j2][j][1];
-        dbvec[ncount][2] = dbarray[j1][j2][j][2];
-        ncount++;
-      }
-    } else if(diagonalstyle == 2) {
-      j = j2 = j1;
-      dbvec[ncount][0] = dbarray[j1][j2][j][0];
-      dbvec[ncount][1] = dbarray[j1][j2][j][1];
-      dbvec[ncount][2] = dbarray[j1][j2][j][2];
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            dbvec[ncount][0] = dbarray[j1][j2][j][0];
-            dbvec[ncount][1] = dbarray[j1][j2][j][1];
-            dbvec[ncount][2] = dbarray[j1][j2][j][2];
-            ncount++;
-          }
-    }
-  }
+  compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::zero_uarraytot()
 {
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] = 0.0;
-        uarraytot_i[j][ma][mb] = 0.0;
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        ulisttot_r[jju] = 0.0;
+        ulisttot_i[jju] = 0.0;
+        jju++;
       }
+  }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::addself_uarraytot(double wself_in)
 {
-  for (int j = 0; j <= twojmax; j++)
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
     for (int ma = 0; ma <= j; ma++) {
-      uarraytot_r[j][ma][ma] = wself_in;
-      uarraytot_i[j][ma][ma] = 0.0;
+      ulisttot_r[jju] = wself_in;
+      ulisttot_i[jju] = 0.0;
+      jju += j+2;
     }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -972,14 +900,17 @@ void SNA::add_uarraytot(double r, double wj, double rcut)
 
   sfac *= wj;
 
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] +=
-          sfac * uarray_r[j][ma][mb];
-        uarraytot_i[j][ma][mb] +=
-          sfac * uarray_i[j][ma][mb];
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        ulisttot_r[jju] +=
+          sfac * ulist_r[jju];
+        ulisttot_i[jju] +=
+          sfac * ulist_i[jju];
+        jju++;
       }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1003,63 +934,72 @@ void SNA::compute_uarray(double x, double y, double z,
 
   // VMK Section 4.8.2
 
-  uarray_r[0][0][0] = 1.0;
-  uarray_i[0][0][0] = 0.0;
+  ulist_r[0] = 1.0;
+  ulist_i[0] = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    int jjup = idxu_block[j-1];
 
     // fill in left side of matrix layer from previous layer
 
     for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
+      ulist_r[jju] = 0.0;
+      ulist_i[jju] = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
         rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] +=
+        ulist_r[jju] +=
           rootpq *
-          (a_r * uarray_r[j - 1][ma][mb] +
-           a_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] +=
+          (a_r * ulist_r[jjup] +
+           a_i * ulist_i[jjup]);
+        ulist_i[jju] +=
           rootpq *
-          (a_r * uarray_i[j - 1][ma][mb] -
-           a_i * uarray_r[j - 1][ma][mb]);
+          (a_r * ulist_i[jjup] -
+           a_i * ulist_r[jjup]);
 
         rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
+        ulist_r[jju+1] =
           -rootpq *
-          (b_r * uarray_r[j - 1][ma][mb] +
-           b_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
+          (b_r * ulist_r[jjup] +
+           b_i * ulist_i[jjup]);
+        ulist_i[jju+1] =
           -rootpq *
-          (b_r * uarray_i[j - 1][ma][mb] -
-           b_i * uarray_r[j - 1][ma][mb]);
+          (b_r * ulist_i[jjup] -
+           b_i * ulist_r[jjup]);
+        jju++;
+        jjup++;
       }
+      jju++;
     }
 
     // copy left side to right side with inversion symmetry VMK 4.4(2)
     // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
 
-    int mbpar = -1;
+    jju = idxu_block[j];
+    jjup = jju+(j+1)*(j+1)-1;
+    int mbpar = 1;
     for (int mb = 0; 2*mb <= j; mb++) {
-      mbpar = -mbpar;
-      int mapar = -mbpar;
+      int mapar = mbpar;
       for (int ma = 0; ma <= j; ma++) {
-        mapar = -mapar;
         if (mapar == 1) {
-          uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
+          ulist_r[jjup] = ulist_r[jju];
+          ulist_i[jjup] = -ulist_i[jju];
         } else {
-          uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
+          ulist_r[jjup] = -ulist_r[jju];
+          ulist_i[jjup] = ulist_i[jju];
         }
+        mapar = -mapar;
+        jju++;
+        jjup--;
       }
+      mbpar = -mbpar;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
-   compute derivatives of Wigner U-functions for one neighbor
+   Compute derivatives of Wigner U-functions for one neighbor
    see comments in compute_uarray()
 ------------------------------------------------------------------------- */
 
@@ -1109,93 +1049,105 @@ void SNA::compute_duarray(double x, double y, double z,
   db_i[0] += -r0inv;
   db_r[1] += r0inv;
 
-  uarray_r[0][0][0] = 1.0;
-  duarray_r[0][0][0][0] = 0.0;
-  duarray_r[0][0][0][1] = 0.0;
-  duarray_r[0][0][0][2] = 0.0;
-  uarray_i[0][0][0] = 0.0;
-  duarray_i[0][0][0][0] = 0.0;
-  duarray_i[0][0][0][1] = 0.0;
-  duarray_i[0][0][0][2] = 0.0;
+  ulist_r[0] = 1.0;
+  dulist_r[0][0] = 0.0;
+  dulist_r[0][1] = 0.0;
+  dulist_r[0][2] = 0.0;
+  ulist_i[0] = 0.0;
+  dulist_i[0][0] = 0.0;
+  dulist_i[0][1] = 0.0;
+  dulist_i[0][2] = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    int jjup = idxu_block[j-1];
     for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      duarray_r[j][0][mb][0] = 0.0;
-      duarray_r[j][0][mb][1] = 0.0;
-      duarray_r[j][0][mb][2] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
-      duarray_i[j][0][mb][0] = 0.0;
-      duarray_i[j][0][mb][1] = 0.0;
-      duarray_i[j][0][mb][2] = 0.0;
+      ulist_r[jju] = 0.0;
+      dulist_r[jju][0] = 0.0;
+      dulist_r[jju][1] = 0.0;
+      dulist_r[jju][2] = 0.0;
+      ulist_i[jju] = 0.0;
+      dulist_i[jju][0] = 0.0;
+      dulist_i[jju][1] = 0.0;
+      dulist_i[jju][2] = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
         rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] += rootpq *
-                               (a_r *  uarray_r[j - 1][ma][mb] +
-                                a_i *  uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] += rootpq *
-                               (a_r *  uarray_i[j - 1][ma][mb] -
-                                a_i *  uarray_r[j - 1][ma][mb]);
+        ulist_r[jju] += rootpq *
+                               (a_r *  ulist_r[jjup] +
+                                a_i *  ulist_i[jjup]);
+        ulist_i[jju] += rootpq *
+                               (a_r *  ulist_i[jjup] -
+                                a_i *  ulist_r[jjup]);
 
         for (int k = 0; k < 3; k++) {
-          duarray_r[j][ma][mb][k] +=
-            rootpq * (da_r[k] * uarray_r[j - 1][ma][mb] +
-                      da_i[k] * uarray_i[j - 1][ma][mb] +
-                      a_r * duarray_r[j - 1][ma][mb][k] +
-                      a_i * duarray_i[j - 1][ma][mb][k]);
-          duarray_i[j][ma][mb][k] +=
-            rootpq * (da_r[k] * uarray_i[j - 1][ma][mb] -
-                      da_i[k] * uarray_r[j - 1][ma][mb] +
-                      a_r * duarray_i[j - 1][ma][mb][k] -
-                      a_i * duarray_r[j - 1][ma][mb][k]);
+          dulist_r[jju][k] +=
+            rootpq * (da_r[k] * ulist_r[jjup] +
+                      da_i[k] * ulist_i[jjup] +
+                      a_r * dulist_r[jjup][k] +
+                      a_i * dulist_i[jjup][k]);
+          dulist_i[jju][k] +=
+            rootpq * (da_r[k] * ulist_i[jjup] -
+                      da_i[k] * ulist_r[jjup] +
+                      a_r * dulist_i[jjup][k] -
+                      a_i * dulist_r[jjup][k]);
         }
 
         rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
-          -rootpq * (b_r *  uarray_r[j - 1][ma][mb] +
-                     b_i *  uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
-          -rootpq * (b_r *  uarray_i[j - 1][ma][mb] -
-                     b_i *  uarray_r[j - 1][ma][mb]);
+        ulist_r[jju+1] =
+          -rootpq * (b_r *  ulist_r[jjup] +
+                     b_i *  ulist_i[jjup]);
+        ulist_i[jju+1] =
+          -rootpq * (b_r *  ulist_i[jjup] -
+                     b_i *  ulist_r[jjup]);
 
         for (int k = 0; k < 3; k++) {
-          duarray_r[j][ma + 1][mb][k] =
-            -rootpq * (db_r[k] * uarray_r[j - 1][ma][mb] +
-                       db_i[k] * uarray_i[j - 1][ma][mb] +
-                       b_r * duarray_r[j - 1][ma][mb][k] +
-                       b_i * duarray_i[j - 1][ma][mb][k]);
-          duarray_i[j][ma + 1][mb][k] =
-            -rootpq * (db_r[k] * uarray_i[j - 1][ma][mb] -
-                       db_i[k] * uarray_r[j - 1][ma][mb] +
-                       b_r * duarray_i[j - 1][ma][mb][k] -
-                       b_i * duarray_r[j - 1][ma][mb][k]);
+          dulist_r[jju+1][k] =
+            -rootpq * (db_r[k] * ulist_r[jjup] +
+                       db_i[k] * ulist_i[jjup] +
+                       b_r * dulist_r[jjup][k] +
+                       b_i * dulist_i[jjup][k]);
+          dulist_i[jju+1][k] =
+            -rootpq * (db_r[k] * ulist_i[jjup] -
+                       db_i[k] * ulist_r[jjup] +
+                       b_r * dulist_i[jjup][k] -
+                       b_i * dulist_r[jjup][k]);
         }
+        jju++;
+        jjup++;
       }
+      jju++;
     }
 
-    int mbpar = -1;
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    jju = idxu_block[j];
+    jjup = jju+(j+1)*(j+1)-1;
+    int mbpar = 1;
     for (int mb = 0; 2*mb <= j; mb++) {
-      mbpar = -mbpar;
-      int mapar = -mbpar;
+      int mapar = mbpar;
       for (int ma = 0; ma <= j; ma++) {
-        mapar = -mapar;
         if (mapar == 1) {
-          uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
+          ulist_r[jjup] = ulist_r[jju];
+          ulist_i[jjup] = -ulist_i[jju];
           for (int k = 0; k < 3; k++) {
-            duarray_r[j][j-ma][j-mb][k] = duarray_r[j][ma][mb][k];
-            duarray_i[j][j-ma][j-mb][k] = -duarray_i[j][ma][mb][k];
+            dulist_r[jjup][k] = dulist_r[jju][k];
+            dulist_i[jjup][k] = -dulist_i[jju][k];
           }
         } else {
-          uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
+          ulist_r[jjup] = -ulist_r[jju];
+          ulist_i[jjup] = ulist_i[jju];
           for (int k = 0; k < 3; k++) {
-            duarray_r[j][j-ma][j-mb][k] = -duarray_r[j][ma][mb][k];
-            duarray_i[j][j-ma][j-mb][k] = duarray_i[j][ma][mb][k];
+            dulist_r[jjup][k] = -dulist_r[jju][k];
+            dulist_i[jjup][k] = dulist_i[jju][k];
           }
         }
+        mapar = -mapar;
+        jju++;
+        jjup--;
       }
+      mbpar = -mbpar;
     }
   }
 
@@ -1204,23 +1156,25 @@ void SNA::compute_duarray(double x, double y, double z,
 
   sfac *= wj;
   dsfac *= wj;
-
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        duarray_r[j][ma][mb][0] = dsfac * uarray_r[j][ma][mb] * ux +
-                                  sfac * duarray_r[j][ma][mb][0];
-        duarray_i[j][ma][mb][0] = dsfac * uarray_i[j][ma][mb] * ux +
-                                  sfac * duarray_i[j][ma][mb][0];
-        duarray_r[j][ma][mb][1] = dsfac * uarray_r[j][ma][mb] * uy +
-                                  sfac * duarray_r[j][ma][mb][1];
-        duarray_i[j][ma][mb][1] = dsfac * uarray_i[j][ma][mb] * uy +
-                                  sfac * duarray_i[j][ma][mb][1];
-        duarray_r[j][ma][mb][2] = dsfac * uarray_r[j][ma][mb] * uz +
-                                  sfac * duarray_r[j][ma][mb][2];
-        duarray_i[j][ma][mb][2] = dsfac * uarray_i[j][ma][mb] * uz +
-                                  sfac * duarray_i[j][ma][mb][2];
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; 2*mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        dulist_r[jju][0] = dsfac * ulist_r[jju] * ux +
+                                  sfac * dulist_r[jju][0];
+        dulist_i[jju][0] = dsfac * ulist_i[jju] * ux +
+                                  sfac * dulist_i[jju][0];
+        dulist_r[jju][1] = dsfac * ulist_r[jju] * uy +
+                                  sfac * dulist_r[jju][1];
+        dulist_i[jju][1] = dsfac * ulist_i[jju] * uy +
+                                  sfac * dulist_i[jju][1];
+        dulist_r[jju][2] = dsfac * ulist_r[jju] * uz +
+                                  sfac * dulist_r[jju][2];
+        dulist_i[jju][2] = dsfac * ulist_i[jju] * uz +
+                                  sfac * dulist_i[jju][2];
+        jju++;
       }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1229,89 +1183,89 @@ void SNA::compute_duarray(double x, double y, double z,
 
 double SNA::memory_usage()
 {
+  int jdimpq = twojmax + 2;
   int jdim = twojmax + 1;
   double bytes;
-  bytes = jdim * jdim * jdim * jdim * jdim * sizeof(double);
-  bytes += 2 * jdim * jdim * jdim * sizeof(complex<double>);
-  bytes += 2 * jdim * jdim * jdim * sizeof(double);
-  bytes += jdim * jdim * jdim * 3 * sizeof(complex<double>);
-  bytes += jdim * jdim * jdim * 3 * sizeof(double);
-  bytes += ncoeff * sizeof(double);
-  bytes += jdim * jdim * jdim * jdim * jdim * sizeof(complex<double>);
+  bytes = ncoeff * sizeof(double);                       // coeff
+
+  bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
+  bytes += idxcg_max * sizeof(double);                   // cglist
+  bytes += jdim * jdim * jdim * sizeof(int);                // idxcg_block
+
+  bytes += idxu_max * sizeof(double) * 2;                // ulist
+  bytes += idxu_max * sizeof(double) * 2;                // ulisttot
+  bytes += idxu_max * 3 * sizeof(double) * 2;            // dulist
+  bytes += jdim * sizeof(int);                              // idxu_block
+
+  bytes += idxz_max * 9 * sizeof(int);                      // idxz
+  bytes += idxz_max * sizeof(double) * 2;                // zlist
+  bytes += jdim * jdim * jdim * sizeof(int);                // idxz_block
+
+  bytes += idxu_max * sizeof(double) * 2;                // ylist
+  bytes += idxb_max * 3 * sizeof(int);                      // idxb
+
+  bytes += jdim * jdim * jdim * sizeof(int);                // idxb_block
+
   return bytes;
 }
-
 /* ---------------------------------------------------------------------- */
 
 void SNA::create_twojmax_arrays()
 {
-  int jdim = twojmax + 1;
-
-  memory->create(cgarray, jdim, jdim, jdim, jdim, jdim,
-                 "sna:cgarray");
-  memory->create(rootpqarray, jdim+1, jdim+1,
+  int jdimpq = twojmax + 2;
+  memory->create(rootpqarray, jdimpq, jdimpq,
                  "sna:rootpqarray");
-  memory->create(barray, jdim, jdim, jdim,
-                 "sna:barray");
-  memory->create(dbarray, jdim, jdim, jdim, 3,
-                 "sna:dbarray");
-
-  memory->create(duarray_r, jdim, jdim, jdim, 3,
-                 "sna:duarray");
-  memory->create(duarray_i, jdim, jdim, jdim, 3,
-                 "sna:duarray");
-
-  memory->create(uarray_r, jdim, jdim, jdim,
-                 "sna:uarray");
-  memory->create(uarray_i, jdim, jdim, jdim,
-                 "sna:uarray");
+  memory->create(cglist, idxcg_max, "sna:cglist");
+  memory->create(ulist_r, idxu_max, "sna:ulist");
+  memory->create(ulist_i, idxu_max, "sna:ulist");
+  memory->create(ulisttot_r, idxu_max, "sna:ulisttot");
+  memory->create(ulisttot_i, idxu_max, "sna:ulisttot");
+  memory->create(dulist_r, idxu_max, 3, "sna:dulist");
+  memory->create(dulist_i, idxu_max, 3, "sna:dulist");
+  memory->create(zlist_r, idxz_max, "sna:zlist");
+  memory->create(zlist_i, idxz_max, "sna:zlist");
+  memory->create(blist, idxb_max, "sna:blist");
+  memory->create(dblist, idxb_max, 3, "sna:dblist");
+  memory->create(ylist_r, idxu_max, "sna:ylist");
+  memory->create(ylist_i, idxu_max, "sna:ylist");
 
   if (bzero_flag)
-    memory->create(bzero, jdim,"sna:bzero");
+    memory->create(bzero, twojmax+1,"sna:bzero");
   else
     bzero = NULL;
 
-
-  memory->create(uarraytot_r, jdim, jdim, jdim,
-                 "sna:uarraytot");
-  memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
-                 "sna:zarray");
-  memory->create(uarraytot_i, jdim, jdim, jdim,
-                 "sna:uarraytot");
-  memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
-                 "sna:zarray");
-  memory->create(yarray_r, jdim, jdim, jdim,
-                 "sna:yarray");
-  memory->create(yarray_i, jdim, jdim, jdim,
-                 "sna:yarray");
-
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::destroy_twojmax_arrays()
 {
-  memory->destroy(cgarray);
   memory->destroy(rootpqarray);
-  memory->destroy(barray);
+  memory->destroy(cglist);
+  memory->destroy(idxcg_block);
 
-  memory->destroy(dbarray);
+  memory->destroy(ulist_r);
+  memory->destroy(ulist_i);
+  memory->destroy(ulisttot_r);
+  memory->destroy(ulisttot_i);
+  memory->destroy(dulist_r);
+  memory->destroy(dulist_i);
+  memory->destroy(idxu_block);
 
-  memory->destroy(duarray_r);
-  memory->destroy(duarray_i);
+  memory->destroy(zlist_r);
+  memory->destroy(zlist_i);
+  memory->destroy(blist);
+  memory->destroy(dblist);
+  memory->destroy(idxz_block);
 
-  memory->destroy(uarray_r);
-  memory->destroy(uarray_i);
+  memory->destroy(ylist_r);
+  memory->destroy(ylist_i);
+
+  memory->destroy(idxb_block);
 
   if (bzero_flag)
     memory->destroy(bzero);
 
-  memory->destroy(uarraytot_r);
-  memory->destroy(zarray_r);
-  memory->destroy(uarraytot_i);
-  memory->destroy(zarray_i);
-  memory->destroy(yarray_r);
-  memory->destroy(yarray_i);
 }
 
 /* ----------------------------------------------------------------------
@@ -1527,28 +1481,33 @@ void SNA::init_clebsch_gordan()
   int m, aa2, bb2, cc2;
   int ifac;
 
-  for (int j1 = 0; j1 <= twojmax; j1++)
-    for (int j2 = 0; j2 <= twojmax; j2++)
-      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-        for (int m1 = 0; m1 <= j1; m1 += 1) {
+  int idxcg_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        for (int m1 = 0; m1 <= j1; m1++) {
           aa2 = 2 * m1 - j1;
 
-          for (int m2 = 0; m2 <= j2; m2 += 1) {
+          for (int m2 = 0; m2 <= j2; m2++) {
 
             // -c <= cc <= c
 
             bb2 = 2 * m2 - j2;
             m = (aa2 + bb2 + j) / 2;
 
-            if(m < 0 || m > j) continue;
+            if(m < 0 || m > j) {
+              cglist[idxcg_count] = 0.0;
+              idxcg_count++;
+              continue;
+            }
 
             sum = 0.0;
 
             for (int z = MAX(0, MAX(-(j - j2 + aa2)
-                                   / 2, -(j - j1 - bb2) / 2));
-                z <= MIN((j1 + j2 - j) / 2,
-                         MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
-                z++) {
+                                    / 2, -(j - j1 - bb2) / 2));
+                 z <= MIN((j1 + j2 - j) / 2,
+                          MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
+                 z++) {
               ifac = z % 2 ? -1 : 1;
               sum += ifac /
                 (factorial(z) *
@@ -1558,20 +1517,22 @@ void SNA::init_clebsch_gordan()
                  factorial((j - j2 + aa2) / 2 + z) *
                  factorial((j - j1 - bb2) / 2 + z));
             }
-
+            
             cc2 = 2 * m - j;
             dcg = deltacg(j1, j2, j);
             sfaccg = sqrt(factorial((j1 + aa2) / 2) *
-                        factorial((j1 - aa2) / 2) *
-                        factorial((j2 + bb2) / 2) *
-                        factorial((j2 - bb2) / 2) *
-                        factorial((j  + cc2) / 2) *
-                        factorial((j  - cc2) / 2) *
-                        (j + 1));
-
-            cgarray[j1][j2][j][m1][m2] = sum * dcg * sfaccg;
+                          factorial((j1 - aa2) / 2) *
+                          factorial((j2 + bb2) / 2) *
+                          factorial((j2 - bb2) / 2) *
+                          factorial((j  + cc2) / 2) *
+                          factorial((j  - cc2) / 2) *
+                          (j + 1));
+            
+            cglist[idxcg_count] = sum * dcg * sfaccg;
+            idxcg_count++;
           }
         }
+      }
 }
 
 /* ----------------------------------------------------------------------
@@ -1586,74 +1547,6 @@ void SNA::init_rootpqarray()
       rootpqarray[p][q] = sqrt(static_cast<double>(p)/q);
 }
 
-/* ----------------------------------------------------------------------
-   a = j/2
-------------------------------------------------------------------------- */
-
-void SNA::jtostr(char* str, int j)
-{
-  if(j % 2 == 0)
-    sprintf(str, "%d", j / 2);
-  else
-    sprintf(str, "%d/2", j);
-}
-
-/* ----------------------------------------------------------------------
-   aa = m - j/2
-------------------------------------------------------------------------- */
-
-void SNA::mtostr(char* str, int j, int m)
-{
-  if(j % 2 == 0)
-    sprintf(str, "%d", m - j / 2);
-  else
-    sprintf(str, "%d/2", 2 * m - j);
-}
-
-/* ----------------------------------------------------------------------
-   list values of Clebsch-Gordan coefficients
-   using notation of VMK Table 8.11
-------------------------------------------------------------------------- */
-
-void SNA::print_clebsch_gordan(FILE* file)
-{
-  char stra[20], strb[20], strc[20], straa[20], strbb[20], strcc[20];
-  int m, aa2, bb2;
-
-  fprintf(file, "a, aa, b, bb, c, cc, c(a,aa,b,bb,c,cc) \n");
-
-  for (int j1 = 0; j1 <= twojmax; j1++) {
-    jtostr(stra, j1);
-
-    for (int j2 = 0; j2 <= twojmax; j2++) {
-      jtostr(strb, j2);
-
-      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-        jtostr(strc, j);
-
-        for (int m1 = 0; m1 <= j1; m1 += 1) {
-          mtostr(straa, j1, m1);
-          aa2 = 2 * m1 - j1;
-
-          for (int m2 = 0; m2 <= j2; m2 += 1) {
-            bb2 = 2 * m2 - j2;
-            m = (aa2 + bb2 + j) / 2;
-
-            if(m < 0 || m > j) continue;
-
-            mtostr(strbb, j2, m2);
-            mtostr(strcc, j, m);
-
-            fprintf(file, "%s\t%s\t%s\t%s\t%s\t%s\t%g\n",
-                    stra, straa, strb, strbb, strc, strcc,
-                    cgarray[j1][j2][j][m1][m2]);
-          }
-        }
-      }
-    }
-  }
-}
-
 /* ---------------------------------------------------------------------- */
 
 int SNA::compute_ncoeff()
@@ -1663,25 +1556,10 @@ int SNA::compute_ncoeff()
   ncount = 0;
 
   for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
+    for (int j2 = 0; j2 <= j1; j2++)
       for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
+           j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) ncount++;
 
   return ncount;
 }
diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h
index b93b0ac7b0..b54ad3482a 100644
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@@ -26,7 +26,6 @@ namespace LAMMPS_NS {
 
 struct SNA_ZINDICES {
   int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
-  double betaj;
 };
 
 struct SNA_BINDICES {
@@ -51,19 +50,20 @@ public:
   void compute_ui(int);
   void compute_zi();
   void compute_yi(const double*);
+  void compute_yterm(int, int, int, const double*);
   void compute_bi();
-  void copy_bi2bvec();
 
   // functions for derivatives
 
   void compute_duidrj(double*, double, double);
   void compute_dbidrj();
   void compute_deidrj(double*);
-  void copy_dbi2dbvec();
   double compute_sfac(double, double);
   double compute_dsfac(double, double);
 
   double* bvec, ** dbvec;
+  double* blist;
+  double** dblist;
   double** rij;
   int* inside;
   double* wj;
@@ -73,31 +73,17 @@ public:
   void grow_rij(int);
 
   int twojmax, diagonalstyle;
-  double*** uarraytot_r, *** uarraytot_i;
-  double***** zarray_r, ***** zarray_i;
-  double*** yarray_r, *** yarray_i;
-  double*** uarray_r, *** uarray_i;
 
 private:
   double rmin0, rfac0;
 
-  // use indexlist instead of loops, constructor generates these
+  // data for bispectrum coefficients
 
   SNA_ZINDICES* idxz;
   SNA_BINDICES* idxb;
   int idxcg_max, idxu_max, idxz_max, idxb_max;
 
-  // data for bispectrum coefficients
-
-  double***** cgarray;
   double** rootpqarray;
-  double*** barray;
-
-  // derivatives of data
-
-  double**** duarray_r, **** duarray_i;
-  double**** dbarray;
-
   double* cglist;  
   int*** idxcg_block; 
 
@@ -121,9 +107,6 @@ private:
   void destroy_twojmax_arrays();
   void init_clebsch_gordan();
   void init_rootpqarray();
-  void jtostr(char*, int);
-  void mtostr(char*, int, int);
-  void print_clebsch_gordan(FILE*);
   void zero_uarraytot();
   void addself_uarraytot(double);
   void add_uarraytot(double, double, double);

From a973700295d264b09503728f805d451c054470da Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Wed, 12 Jun 2019 16:42:28 -0600
Subject: [PATCH 13/21] Completed back-porting of Zombie SNAP improvements,
 particularly noteworthy is reduction in memory footprint, elimination of most
 multidimensional arrays, elimination of diagonal_style, elimination of Z
 array in force calculation.

---
 doc/src/compute_sna_atom.txt       | 39 ++++++---------
 doc/src/pair_snap.txt              |  8 +--
 potentials/Ta06A.snapparam         |  1 -
 potentials/W_2940_2017_2.snapparam |  1 -
 src/SNAP/compute_sna_atom.cpp      | 19 ++-----
 src/SNAP/compute_sna_atom.h        |  2 +-
 src/SNAP/compute_snad_atom.cpp     | 21 +++-----
 src/SNAP/compute_snad_atom.h       |  2 +-
 src/SNAP/compute_snav_atom.cpp     | 21 ++------
 src/SNAP/compute_snav_atom.h       |  2 +-
 src/SNAP/pair_snap.cpp             | 75 +++++++++++++++++-----------
 src/SNAP/pair_snap.h               |  4 +-
 src/SNAP/sna.cpp                   | 80 +++++++++++++++---------------
 src/SNAP/sna.h                     |  9 ++--
 14 files changed, 130 insertions(+), 154 deletions(-)

diff --git a/doc/src/compute_sna_atom.txt b/doc/src/compute_sna_atom.txt
index efbf2e9ea3..10e68f5698 100644
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@@ -24,12 +24,8 @@ twojmax = band limit for bispectrum components (non-negative integer) :l
 R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
 w_1, w_2,... = list of neighbor weights, one for each type  :l
 zero or more keyword/value pairs may be appended :l
-keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
-  {diagonal} value = {0} or {1} or {2} or {3}
-     {0} = all j1, j2, j <= twojmax, j2 <= j1
-     {1} = subset satisfying j1 == j2
-     {2} = subset satisfying j1 == j2 == j3
-     {3} = subset satisfying j2 <= j1 <= j
+keyword = {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
+//     {3} = subset satisfying j2 <= j1 <= j
   {rmin0} value = parameter in distance to angle conversion (distance units)
   {switchflag} value = {0} or {1}
      {0} = do not use switching function
@@ -44,7 +40,7 @@ keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag
 
 [Examples:]
 
-compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 diagonal 3 rmin0 0.0
+compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 rmin0 0.0
 compute db all sna/atom 1.4 0.95 6 2.0 1.0
 compute vb all sna/atom 1.4 0.95 6 2.0 1.0 :pre
 
@@ -151,7 +147,7 @@ The argument {rfac0} and the optional keyword {rmin0} define the
 linear mapping from radial distance to polar angle {theta0} on the
 3-sphere.
 
-The argument {twojmax} and the keyword {diagonal} define which
+The argument {twojmax} defines which
 bispectrum components are generated. See section below on output for a
 detailed explanation of the number of bispectrum components and the
 ordered in which they are listed.
@@ -192,25 +188,20 @@ command that includes all pairs in the neighbor list.
 Compute {sna/atom} calculates a per-atom array, each column
 corresponding to a particular bispectrum component.  The total number
 of columns and the identity of the bispectrum component contained in
-each column depend on the values of {twojmax} and {diagonal}, as
+each column depend of the value of {twojmax}, as
 described by the following piece of python code:
 
 for j1 in range(0,twojmax+1):
-    if(diagonal==2):
-        print j1/2.,j1/2.,j1/2.
-    elif(diagonal==1):
-        for j in range(0,min(twojmax,2*j1)+1,2):
-            print j1/2.,j1/2.,j/2.
-    elif(diagonal==0):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                print j1/2.,j2/2.,j/2.
-    elif(diagonal==3):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                if (j>=j1): print j1/2.,j2/2.,j/2. :pre
+    for j2 in range(0,j1+1):
+        for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
+            if (j>=j1): print j1/2.,j2/2.,j/2. :pre
 
-Compute {snad/atom} evaluates a per-atom array. The columns are
+NOTE: the {diagonal} keyword allowing other possible choices
+for the number of bispectrum components was removed in 2019, 
+since all potentials use the value of 3, corresponding to the
+above set of bispectrum components.
+
+ompute {snad/atom} evaluates a per-atom array. The columns are
 arranged into {ntypes} blocks, listed in order of atom type {I}.  Each
 block contains three sub-blocks corresponding to the {x}, {y}, and {z}
 components of the atom position.  Each of these sub-blocks contains
@@ -259,7 +250,7 @@ package"_Build_package.html doc page for more info.
 
 [Default:]
 
-The optional keyword defaults are {diagonal} = 0, {rmin0} = 0,
+The optional keyword defaults are {rmin0} = 0,
 {switchflag} = 1, {bzeroflag} = 1, {quadraticflag} = 0,
 
 :line
diff --git a/doc/src/pair_snap.txt b/doc/src/pair_snap.txt
index a796cfdeba..1fba74a188 100644
--- a/doc/src/pair_snap.txt
+++ b/doc/src/pair_snap.txt
@@ -38,7 +38,7 @@ where {B_k^i} is the {k}-th bispectrum component of atom {i},
 and {beta_k^alpha_i} is the corresponding linear coefficient
 that depends on {alpha_i}, the SNAP element of atom {i}. The
 number of bispectrum components used and their definitions
-depend on the values of {twojmax} and {diagonalstyle}
+depend on the value of {twojmax}
 defined in the SNAP parameter file described below.
 The bispectrum calculation is described in more detail
 in "compute sna/atom"_compute_sna_atom.html.
@@ -125,14 +125,13 @@ This line is followed by {ncoeff} coefficients, one per line.
 The SNAP parameter file can contain blank and comment lines (start
 with #) anywhere. Each non-blank non-comment line must contain one
 keyword/value pair. The required keywords are {rcutfac} and
-{twojmax}. Optional keywords are {rfac0}, {rmin0}, {diagonalstyle},
+{twojmax}. Optional keywords are {rfac0}, {rmin0},
 {switchflag}, and {bzeroflag}.
 
 The default values for these keywords are
 
 {rfac0} = 0.99363
 {rmin0} = 0.0
-{diagonalstyle} = 3
 {switchflag} = 0
 {bzeroflag} = 1
 {quadraticflag} = 1 :ul
@@ -144,6 +143,9 @@ If {quadraticflag} is set to 1, then the SNAP energy expression includes the qua
 The SNAP element file should contain {K}({K}+1)/2 additional coefficients
 for each element, the upper-triangular elements of alpha.
 
+NOTE: The previously used {diagonalstyle} keyword was removed in 2019,
+since all known SNAP potentials use the default value of 3.
+
 :line
 
 [Mixing, shift, table, tail correction, restart, rRESPA info]:
diff --git a/potentials/Ta06A.snapparam b/potentials/Ta06A.snapparam
index 283629d658..629d96d708 100644
--- a/potentials/Ta06A.snapparam
+++ b/potentials/Ta06A.snapparam
@@ -10,6 +10,5 @@ twojmax 6
 
 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
diff --git a/potentials/W_2940_2017_2.snapparam b/potentials/W_2940_2017_2.snapparam
index 27ab61a266..49f3094d08 100644
--- a/potentials/W_2940_2017_2.snapparam
+++ b/potentials/W_2940_2017_2.snapparam
@@ -8,6 +8,5 @@ twojmax 8
 
 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
diff --git a/src/SNAP/compute_sna_atom.cpp b/src/SNAP/compute_sna_atom.cpp
index fea37faca0..cc7a84281e 100644
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@@ -44,7 +44,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -84,14 +83,7 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      diagonalstyle = atoi(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute sna/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -114,7 +106,7 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute sna/atom command");
   }
 
-  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+  snaptr = new SNA(lmp,rfac0,twojmax,
                    rmin0,switchflag,bzeroflag);
 
   ncoeff = snaptr->ncoeff;
@@ -123,7 +115,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   sna = NULL;
 }
 
@@ -277,9 +268,9 @@ void ComputeSNAAtom::compute_peratom()
 
 double ComputeSNAAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
+  double bytes = nmax*size_peratom_cols * sizeof(double); // sna
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
 
diff --git a/src/SNAP/compute_sna_atom.h b/src/SNAP/compute_sna_atom.h
index 56ffccfa7e..105a62a37a 100644
--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@@ -34,7 +34,7 @@ class ComputeSNAAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff;
   double **cutsq;
   class NeighList *list;
diff --git a/src/SNAP/compute_snad_atom.cpp b/src/SNAP/compute_snad_atom.cpp
index 156380eccc..37587a0aae 100644
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@@ -44,7 +44,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -82,14 +81,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute snad/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -112,7 +104,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snad/atom command");
   }
 
-  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+  snaptr = new SNA(lmp,rfac0,twojmax,
                    rmin0,switchflag,bzeroflag);
 
   ncoeff = snaptr->ncoeff;
@@ -125,7 +117,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   snad = NULL;
 }
 
@@ -378,9 +369,9 @@ void ComputeSNADAtom::unpack_reverse_comm(int n, int *list, double *buf)
 
 double ComputeSNADAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 3*nperdim*atom->ntypes;
+
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snad
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
diff --git a/src/SNAP/compute_snad_atom.h b/src/SNAP/compute_snad_atom.h
index 1fcf540d7c..ac353d8553 100644
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@@ -36,7 +36,7 @@ class ComputeSNADAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff, nperdim, yoffset, zoffset;
   double **cutsq;
   class NeighList *list;
diff --git a/src/SNAP/compute_snav_atom.cpp b/src/SNAP/compute_snav_atom.cpp
index 6caff0820c..1f702496ed 100644
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@@ -44,7 +44,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -78,14 +77,7 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute snav/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -108,7 +100,7 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snav/atom command");
   }
 
-  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+  snaptr = new SNA(lmp,rfac0,twojmax,
                    rmin0,switchflag,bzeroflag);
 
   ncoeff = snaptr->ncoeff;
@@ -119,7 +111,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   snav = NULL;
 
 }
@@ -389,10 +380,8 @@ void ComputeSNAVAtom::unpack_reverse_comm(int n, int *list, double *buf)
 
 double ComputeSNAVAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 6*nperdim*atom->ntypes;
-  if (quadraticflag) bytes += 6*nperdim*atom->ntypes;
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snav
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
diff --git a/src/SNAP/compute_snav_atom.h b/src/SNAP/compute_snav_atom.h
index 6bcce346e0..9df17cc667 100644
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@@ -36,7 +36,7 @@ class ComputeSNAVAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff, nperdim;
   double **cutsq;
   class NeighList *list;
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 6a65f872fd..6eb05f85a4 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -34,10 +34,6 @@ using namespace LAMMPS_NS;
 #define MAXLINE 1024
 #define MAXWORD 3
 
-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
 /* ---------------------------------------------------------------------- */
 
 PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
@@ -53,8 +49,6 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
   wjelem = NULL;
   coeffelem = NULL;
 
-  nmax = 0;
-
   beta_max = 0;
   beta = NULL;
   bispectrum = NULL;
@@ -74,6 +68,7 @@ PairSNAP::~PairSNAP()
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
   }
+
   memory->destroy(beta);
   memory->destroy(bispectrum);
 
@@ -115,7 +110,8 @@ void PairSNAP::compute(int eflag, int vflag)
 
   // compute dE_i/dB_i = beta_i for all i in list
 
-  compute_bispectrum();
+  if (quadraticflag || eflag) 
+    compute_bispectrum();
   compute_beta();
 
   numneigh = list->numneigh;
@@ -209,15 +205,25 @@ void PairSNAP::compute(int eflag, int vflag)
       evdwl = coeffi[0];
 
       // E = beta.B + 0.5*B^t.alpha.B
-      // coeff[k] = beta[k-1] or
-      // coeff[k] = alpha_ii or
-      // coeff[k] = alpha_ij = alpha_ji, j != i
 
       // linear contributions
 
-      for (int k = 0; k < ncoeff; k++)
-        evdwl += beta[ii][k]*bispectrum[ii][k];
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        evdwl += coeffi[icoeff+1]*bispectrum[ii][icoeff];
 
+      // quadratic contributions
+
+      if (quadraticflag) {
+        int k = ncoeff+1;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          double bveci = bispectrum[ii][icoeff];
+          evdwl += 0.5*coeffi[k++]*bveci*bveci;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            double bvecj = bispectrum[ii][jcoeff];
+            evdwl += coeffi[k++]*bveci*bvecj;
+          }
+        }
+      }
       ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
     }
 
@@ -241,8 +247,23 @@ void PairSNAP::compute_beta()
     const int ielem = map[itype];
     double* coeffi = coeffelem[ielem];
 
-    for (int k = 1; k <= ncoeff; k++)
-      beta[ii][k-1] = coeffi[k];
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      beta[ii][icoeff] = coeffi[icoeff+1];
+
+    if (quadraticflag) {
+      int k = ncoeff+1;
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+        double bveci = bispectrum[ii][icoeff];
+        beta[ii][icoeff] += coeffi[k]*bveci;
+        k++;
+        for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+          double bvecj = bispectrum[ii][jcoeff];
+          beta[ii][icoeff] += coeffi[k]*bvecj;
+          beta[ii][jcoeff] += coeffi[k]*bveci;
+          k++;
+        }
+      }
+    }
   }
 }
 
@@ -308,8 +329,8 @@ void PairSNAP::compute_bispectrum()
     snaptr->compute_zi();
     snaptr->compute_bi();
 
-    for (int k = 0; k < ncoeff; k++)
-      bispectrum[ii][k] = snaptr->blist[k];
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      bispectrum[ii][icoeff] = snaptr->blist[icoeff];
   }
 }
 
@@ -354,8 +375,6 @@ void PairSNAP::coeff(int narg, char **arg)
     memory->destroy(wjelem);
     memory->destroy(coeffelem);
   }
-  memory->destroy(beta);
-  memory->destroy(bispectrum);
 
   char* type1 = arg[0];
   char* type2 = arg[1];
@@ -425,9 +444,7 @@ void PairSNAP::coeff(int narg, char **arg)
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 
   snaptr = new SNA(lmp,rfac0,twojmax,
-                   diagonalstyle,
                    rmin0,switchflag,bzeroflag);
-  snaptr->grow_rij(nmax);
 
   if (ncoeff != snaptr->ncoeff) {
     if (comm->me == 0)
@@ -617,7 +634,6 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
 
   rfac0 = 0.99363;
   rmin0 = 0.0;
-  diagonalstyle = 3;
   switchflag = 1;
   bzeroflag = 1;
   quadraticflag = 0;
@@ -678,8 +694,6 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
       rfac0 = atof(keyval);
     else if (strcmp(keywd,"rmin0") == 0)
       rmin0 = atof(keyval);
-    else if (strcmp(keywd,"diagonalstyle") == 0)
-      diagonalstyle = atoi(keyval);
     else if (strcmp(keywd,"switchflag") == 0)
       switchflag = atoi(keyval);
     else if (strcmp(keywd,"bzeroflag") == 0)
@@ -702,13 +716,16 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
 double PairSNAP::memory_usage()
 {
   double bytes = Pair::memory_usage();
+
   int n = atom->ntypes+1;
-  bytes += n*n*sizeof(int);
-  bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
-  bytes += (2*ncoeffall)*sizeof(double);
-  bytes += (ncoeff*3)*sizeof(double);
+  bytes += n*n*sizeof(int);      // setflag
+  bytes += n*n*sizeof(double);   // cutsq
+  bytes += n*sizeof(int);        // map
+  bytes += beta_max*ncoeff*sizeof(double); // bispectrum
+  bytes += beta_max*ncoeff*sizeof(double); // beta
+
+  bytes += snaptr->memory_usage(); // SNA object
+
   return bytes;
 }
 
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index b5871c1527..c64eaa5d4e 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -40,9 +40,7 @@ public:
 
 protected:
   int ncoeffq, ncoeffall;
-  double **bvec, ***dbvec;
   class SNA* snaptr;
-  int nmax;
   virtual void allocate();
   void read_files(char *, char *);
   inline int equal(double* x,double* y);
@@ -60,7 +58,7 @@ protected:
   double** beta;                // betas for all atoms in list
   double** bispectrum;          // bispectrum components for all atoms in list
   int *map;                     // mapping from atom types to elements
-  int twojmax, diagonalstyle, switchflag, bzeroflag;
+  int twojmax, switchflag, bzeroflag;
   double rfac0, rmin0, wj1, wj2;
   int rcutfacflag, twojmaxflag; // flags for required parameters
   int beta_max;                 // length of beta
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index 131ac48fdb..75601b8e17 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -113,7 +113,7 @@ using namespace MathConst;
 ------------------------------------------------------------------------- */
 
 SNA::SNA(LAMMPS* lmp, double rfac0_in,
-         int twojmax_in, int diagonalstyle_in,
+         int twojmax_in,
          double rmin0_in, int switch_flag_in, int bzero_flag_in) : Pointers(lmp)
 {
   wself = 1.0;
@@ -124,21 +124,16 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
   bzero_flag = bzero_flag_in;
 
   twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;
 
   ncoeff = compute_ncoeff();
 
-  bvec = NULL;
-  dbvec = NULL;
-  memory->create(bvec, ncoeff, "pair:bvec");
-  memory->create(dbvec, ncoeff, 3, "pair:dbvec");
   rij = NULL;
   inside = NULL;
   wj = NULL;
   rcutij = NULL;
   nmax = 0;
   idxz = NULL;
-  idxb= NULL;
+  idxb = NULL;
 
   build_indexlist();
   create_twojmax_arrays();
@@ -159,8 +154,6 @@ SNA::~SNA()
   memory->destroy(inside);
   memory->destroy(wj);
   memory->destroy(rcutij);
-  memory->destroy(bvec);
-  memory->destroy(dbvec);
   delete[] idxz;
   delete[] idxb;
   destroy_twojmax_arrays();
@@ -168,8 +161,6 @@ SNA::~SNA()
 
 void SNA::build_indexlist()
 {
-  if(diagonalstyle != 3)
-    error->all(FLERR, "diagonal_style must be 3\n");
 
   // index list for cglist
 
@@ -180,7 +171,7 @@ void SNA::build_indexlist()
   int idxcg_count = 0;
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
         idxcg_block[j1][j2][j] = idxcg_count; 
         for (int m1 = 0; m1 <= j1; m1++)
           for (int m2 = 0; m2 <= j2; m2++)
@@ -209,7 +200,7 @@ void SNA::build_indexlist()
   int idxb_count = 0;  
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
         if (j >= j1) idxb_count++;
   
   idxb_max = idxb_count;
@@ -218,7 +209,7 @@ void SNA::build_indexlist()
   idxb_count = 0;
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
         if (j >= j1) {
           idxb[idxb_count].j1 = j1;
           idxb[idxb_count].j2 = j2;
@@ -233,7 +224,7 @@ void SNA::build_indexlist()
   idxb_count = 0;
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
         if (j >= j1) {
           idxb_block[j1][j2][j] = idxb_count; 
           idxb_count++;
@@ -246,7 +237,7 @@ void SNA::build_indexlist()
 
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
         for (int mb = 0; 2*mb <= j; mb++)
           for (int ma = 0; ma <= j; ma++)
             idxz_count++;
@@ -260,7 +251,7 @@ void SNA::build_indexlist()
   idxz_count = 0;
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
         idxz_block[j1][j2][j] = idxz_count; 
 
         // find right beta[jjb] entry
@@ -288,6 +279,7 @@ void SNA::build_indexlist()
           }
       }
 }
+
 /* ---------------------------------------------------------------------- */
 
 void SNA::init()
@@ -312,6 +304,7 @@ void SNA::grow_rij(int newnmax)
   memory->create(wj, nmax, "pair:wj");
   memory->create(rcutij, nmax, "pair:rcutij");
 }
+
 /* ----------------------------------------------------------------------
    compute Ui by summing over neighbors j
 ------------------------------------------------------------------------- */
@@ -401,12 +394,6 @@ void SNA::compute_zi()
       icgb += j2;
     } // end loop over ib
 
-//     // apply symmetry factor
- 
-//     const double jfac = 1.0/(j+1);
-//     zlist_r[jjz] *= jfac;
-//     zlist_i[jjz] *= jfac;
-    
   } // end loop over jjz
 }
 
@@ -631,6 +618,11 @@ void SNA::compute_bi()
     } // end if jeven
 
     blist[jjb] = 2.0*sumzu;
+
+    // apply bzero shift
+
+    if (bzero_flag)
+      blist[jjb] -= bzero[j];
   }
 }
 
@@ -1186,25 +1178,38 @@ double SNA::memory_usage()
   int jdimpq = twojmax + 2;
   int jdim = twojmax + 1;
   double bytes;
-  bytes = ncoeff * sizeof(double);                       // coeff
+
+  bytes = 0;
 
   bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
   bytes += idxcg_max * sizeof(double);                   // cglist
-  bytes += jdim * jdim * jdim * sizeof(int);                // idxcg_block
 
   bytes += idxu_max * sizeof(double) * 2;                // ulist
   bytes += idxu_max * sizeof(double) * 2;                // ulisttot
   bytes += idxu_max * 3 * sizeof(double) * 2;            // dulist
-  bytes += jdim * sizeof(int);                              // idxu_block
 
-  bytes += idxz_max * 9 * sizeof(int);                      // idxz
   bytes += idxz_max * sizeof(double) * 2;                // zlist
-  bytes += jdim * jdim * jdim * sizeof(int);                // idxz_block
-
+  bytes += idxb_max * sizeof(double);                    // blist
+  bytes += idxb_max * 3 * sizeof(double);                // dblist
   bytes += idxu_max * sizeof(double) * 2;                // ylist
-  bytes += idxb_max * 3 * sizeof(int);                      // idxb
 
-  bytes += jdim * jdim * jdim * sizeof(int);                // idxb_block
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxcg_block
+  bytes += jdim * sizeof(int);                           // idxu_block
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxz_block
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxb_block
+
+  bytes += idxz_max * sizeof(SNA_ZINDICES);              // idxz
+  bytes += idxb_max * sizeof(SNA_BINDICES);              // idxb
+
+  bytes += jdim * sizeof(double);                        // bzero
+
+  bytes += nmax * 3 * sizeof(double);                    // rij
+  bytes += nmax * sizeof(int);                           // inside
+  bytes += nmax * sizeof(double);                        // wj
+  bytes += nmax * sizeof(double);                        // rcutij
+
+  printf("SNAP Z list Memory Usage %d\n",idxz_max * sizeof(double) * 2);
+  printf("SNAP CG list Memory Usage %d\n",idxcg_max * sizeof(double));
 
   return bytes;
 }
@@ -1242,25 +1247,22 @@ void SNA::destroy_twojmax_arrays()
 {
   memory->destroy(rootpqarray);
   memory->destroy(cglist);
-  memory->destroy(idxcg_block);
-
   memory->destroy(ulist_r);
   memory->destroy(ulist_i);
   memory->destroy(ulisttot_r);
   memory->destroy(ulisttot_i);
   memory->destroy(dulist_r);
   memory->destroy(dulist_i);
-  memory->destroy(idxu_block);
-
   memory->destroy(zlist_r);
   memory->destroy(zlist_i);
   memory->destroy(blist);
   memory->destroy(dblist);
-  memory->destroy(idxz_block);
-
   memory->destroy(ylist_r);
   memory->destroy(ylist_i);
 
+  memory->destroy(idxcg_block);
+  memory->destroy(idxu_block);
+  memory->destroy(idxz_block);
   memory->destroy(idxb_block);
 
   if (bzero_flag)
@@ -1484,7 +1486,7 @@ void SNA::init_clebsch_gordan()
   int idxcg_count = 0;
   for(int j1 = 0; j1 <= twojmax; j1++)
     for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
         for (int m1 = 0; m1 <= j1; m1++) {
           aa2 = 2 * m1 - j1;
 
@@ -1557,7 +1559,7 @@ int SNA::compute_ncoeff()
 
   for (int j1 = 0; j1 <= twojmax; j1++)
     for (int j2 = 0; j2 <= j1; j2++)
-      for (int j = abs(j1 - j2);
+      for (int j = j1 - j2;
            j <= MIN(twojmax, j1 + j2); j += 2)
         if (j >= j1) ncount++;
 
diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h
index b54ad3482a..1e08ef123c 100644
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@@ -18,9 +18,7 @@
 #ifndef LMP_SNA_H
 #define LMP_SNA_H
 
-#include <complex>
 #include "pointers.h"
-#include <ctime>
 
 namespace LAMMPS_NS {
 
@@ -35,7 +33,7 @@ struct SNA_BINDICES {
 class SNA : protected Pointers {
 
 public:
-  SNA(LAMMPS*, double, int, int, double, int, int);
+  SNA(LAMMPS*, double, int, double, int, int);
 
   SNA(LAMMPS* lmp) : Pointers(lmp) {};
   ~SNA();
@@ -61,7 +59,6 @@ public:
   double compute_sfac(double, double);
   double compute_dsfac(double, double);
 
-  double* bvec, ** dbvec;
   double* blist;
   double** dblist;
   double** rij;
@@ -72,7 +69,7 @@ public:
 
   void grow_rij(int);
 
-  int twojmax, diagonalstyle;
+  int twojmax;
 
 private:
   double rmin0, rfac0;
@@ -126,7 +123,7 @@ private:
   double wself;
 
   int bzero_flag; // 1 if bzero subtracted from barray
-  double *bzero;  // array of B values for isolated atoms
+  double* bzero;  // array of B values for isolated atoms
 };
 
 }

From 3f523ea906a417232a7be1e8d64778fef41041d0 Mon Sep 17 00:00:00 2001
From: athomps <athomps@sandia.gov>
Date: Wed, 12 Jun 2019 17:02:59 -0600
Subject: [PATCH 14/21] Not part of this pull request

---
 examples/snap/W.nnsnap | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 examples/snap/W.nnsnap

diff --git a/examples/snap/W.nnsnap b/examples/snap/W.nnsnap
deleted file mode 100644
index 6ca97a701a..0000000000
--- a/examples/snap/W.nnsnap
+++ /dev/null
@@ -1,16 +0,0 @@
-# DATE: 2017-02-20 CONTRIBUTOR: Mitchell Wood mitwood@sandia.gov CITATION: Wood, M. A. and Thompson, A. P. "Quantum-Accurate Molecular Dynamics Potential for Tungsten" arXiv:1702.07042 [physics.comp-ph]
-#
-# Definition of SNAP+ZBL potential.
-variable zblcutinner equal 4
-variable zblcutouter equal 4.8
-variable zblz equal 74
-
-# Specify hybrid with SNAP and ZBL
-
-pair_style hybrid/overlay &
-zbl ${zblcutinner} ${zblcutouter} &
-nn/snap
-pair_coeff 1 1 zbl ${zblz} ${zblz}
-pair_coeff * * nn/snap W_2940_2017_2.snapcoeff W_2940_2017_2.snapparam W
-
-#Nomenclature on the snap files are Element_DakotaID_Year_Month

From f8e257d21967470b0e4c6e47bd3830a3192e871a Mon Sep 17 00:00:00 2001
From: athomps <athomps@sandia.gov>
Date: Wed, 12 Jun 2019 17:04:05 -0600
Subject: [PATCH 15/21] Not part of this pull request

---
 src/SNAP/pair_nn_snap.cpp | 1824 -------------------------------------
 1 file changed, 1824 deletions(-)
 delete mode 100644 src/SNAP/pair_nn_snap.cpp

diff --git a/src/SNAP/pair_nn_snap.cpp b/src/SNAP/pair_nn_snap.cpp
deleted file mode 100644
index e90f6d6b1b..0000000000
--- a/src/SNAP/pair_nn_snap.cpp
+++ /dev/null
@@ -1,1824 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include "pair_nn_snap.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "force.h"
-#include "comm.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "sna.h"
-#include "openmp_snap.h"
-#include "domain.h"
-#include "memory.h"
-#include "error.h"
-
-#include <cmath>
-
-using namespace LAMMPS_NS;
-
-#define MAXLINE 1024
-#define MAXWORD 3
-
-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
-/* ---------------------------------------------------------------------- */
-
-PairNNSNAP::PairNNSNAP(LAMMPS *lmp) : Pair(lmp)
-{
-  single_enable = 0;
-  restartinfo = 0;
-  one_coeff = 1;
-  manybody_flag = 1;
-
-  nelements = 0;
-  elements = NULL;
-  radelem = NULL;
-  wjelem = NULL;
-  coeffelem = NULL;
-
-  nmax = 0;
-  nthreads = 1;
-
-  schedule_user = 0;
-  schedule_time_guided = -1;
-  schedule_time_dynamic = -1;
-  ncalls_neigh =-1;
-
-  ilistmask_max = 0;
-  ilistmask = NULL;
-  ghostinum = 0;
-  ghostilist_max = 0;
-  ghostilist = NULL;
-  ghostnumneigh_max = 0;
-  ghostnumneigh = NULL;
-  ghostneighs = NULL;
-  ghostfirstneigh = NULL;
-  ghostneighs_total = 0;
-  ghostneighs_max = 0;
-
-  i_max = 0;
-  i_neighmax = 0;
-  i_numpairs = 0;
-  i_rij = NULL;
-  i_inside = NULL;
-  i_wj = NULL;
-  i_rcutij = NULL;
-  i_ninside = NULL;
-  i_pairs = NULL;
-  i_uarraytot_r = NULL;
-  i_uarraytot_i = NULL;
-  i_zarray_r = NULL;
-  i_zarray_i = NULL;
-
-  use_shared_arrays = 0;
-
-#ifdef TIMING_INFO
-  timers[0] = 0;
-  timers[1] = 0;
-  timers[2] = 0;
-  timers[3] = 0;
-#endif
-
-  // Need to set this because restart not handled by PairHybrid
-
-  sna = NULL;
-
-  beta_max = 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-PairNNSNAP::~PairNNSNAP()
-{
-  if (copymode) return;
-
-  if (nelements) {
-    for (int i = 0; i < nelements; i++)
-      delete[] elements[i];
-    delete[] elements;
-    memory->destroy(radelem);
-    memory->destroy(wjelem);
-    memory->destroy(coeffelem);
-    memory->destroy(beta);
-  }
-
-  // Need to set this because restart not handled by PairHybrid
-
-  if (sna) {
-
-#ifdef TIMING_INFO
-    double time[5];
-    double timeave[5];
-    double timeave_mpi[5];
-    double timemax_mpi[5];
-
-    for (int i = 0; i < 5; i++) {
-      time[i] = 0;
-      timeave[i] = 0;
-      for (int tid = 0; tid<nthreads; tid++) {
-        if (sna[tid]->timers[i]>time[i])
-          time[i] = sna[tid]->timers[i];
-        timeave[i] += sna[tid]->timers[i];
-      }
-      timeave[i] /= nthreads;
-    }
-    MPI_Reduce(timeave, timeave_mpi, 5, MPI_DOUBLE, MPI_SUM, 0, world);
-    MPI_Reduce(time, timemax_mpi, 5, MPI_DOUBLE, MPI_MAX, 0, world);
-#endif
-
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-
-  }
-
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(map);
-  }
-
-}
-
-void PairNNSNAP::compute(int eflag, int vflag)
-{
-//   if (use_optimized)
-//     compute_optimized(eflag, vflag);
-//   else
-
-// hard-code compute_regular()
- 
-    compute_regular(eflag, vflag);
-}
-
-/* ----------------------------------------------------------------------
-   This version is a straightforward implementation
-   ---------------------------------------------------------------------- */
-
-void PairNNSNAP::compute_regular(int eflag, int vflag)
-{
-  int i,j,jnum,ninside;
-  double delx,dely,delz,evdwl,rsq;
-  double fij[3];
-  int *jlist,*numneigh,**firstneigh;
-  evdwl = 0.0;
-
-  ev_init(eflag,vflag);
-
-  double **x = atom->x;
-  double **f = atom->f;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-  int newton_pair = force->newton_pair;
-  class SNA* snaptr = sna[0];
-
-  if (beta_max < list->inum) {
-    memory->grow(beta,list->inum,ncoeff,"PairNNSNAP:beta");
-    beta_max = list->inum;
-  }
-
-  // compute dE_i/dB_i = beta_i for all i in list
-
-  compute_beta();
-
-  numneigh = list->numneigh;
-  firstneigh = list->firstneigh;
-
-  for (int ii = 0; ii < list->inum; ii++) {
-    i = list->ilist[ii];
-
-    const double xtmp = x[i][0];
-    const double ytmp = x[i][1];
-    const double ztmp = x[i][2];
-    const int itype = type[i];
-    const int ielem = map[itype];
-    const double radi = radelem[ielem];
-
-    jlist = firstneigh[i];
-    jnum = numneigh[i];
-
-    // insure rij, inside, wj, and rcutij are of size jnum
-
-    snaptr->grow_rij(jnum);
-
-    // rij[][3] = displacements between atom I and those neighbors
-    // inside = indices of neighbors of I within cutoff
-    // wj = weights for neighbors of I within cutoff
-    // rcutij = cutoffs for neighbors of I within cutoff
-    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
-
-    ninside = 0;
-    for (int jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-      delx = x[j][0] - xtmp;
-      dely = x[j][1] - ytmp;
-      delz = x[j][2] - ztmp;
-      rsq = delx*delx + dely*dely + delz*delz;
-      int jtype = type[j];
-      int jelem = map[jtype];
-
-      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-        snaptr->rij[ninside][0] = delx;
-        snaptr->rij[ninside][1] = dely;
-        snaptr->rij[ninside][2] = delz;
-        snaptr->inside[ninside] = j;
-        snaptr->wj[ninside] = wjelem[jelem];
-        snaptr->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
-        ninside++;
-      }
-    }
-
-    // compute Ui, Zi, and Bi for atom I
-
-    snaptr->compute_ui(ninside);
-    snaptr->compute_zi();
-    if (quadraticflag) {
-      snaptr->compute_bi();
-      snaptr->copy_bi2bvec();
-    }
-
-    // for neighbors of I within cutoff:
-    // compute Fij = dEi/dRj = -dEi/dRi 
-    // add to Fi, subtract from Fj
-
-    // compute beta_i*Z_i = Y_i
-
-    snaptr->compute_yi(beta[ii]);
-
-    for (int jj = 0; jj < ninside; jj++) {
-      int j = snaptr->inside[jj];
-      snaptr->compute_duidrj(snaptr->rij[jj],
-                             snaptr->wj[jj],snaptr->rcutij[jj]);
-
-//       // quadratic contributions
-
-//       if (quadraticflag) {
-//         int k = ncoeff+1;
-//         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-//           double bveci = snaptr->bvec[icoeff];
-//           double fack = coeffi[k]*bveci;
-//           double* dbveci = snaptr->dbvec[icoeff];
-//           fij[0] += fack*dbveci[0];
-//           fij[1] += fack*dbveci[1];
-//           fij[2] += fack*dbveci[2];
-//           k++;
-//           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-//             double facki = coeffi[k]*bveci;
-//             double fackj = coeffi[k]*snaptr->bvec[jcoeff];
-//             double* dbvecj = snaptr->dbvec[jcoeff];
-
-//             fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-//             fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-//             fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-//             k++;
-//           }
-//         }
-//       }
-
-      snaptr->compute_deidrj(fij);
-
-      f[i][0] += fij[0];
-      f[i][1] += fij[1];
-      f[i][2] += fij[2];
-      f[j][0] -= fij[0];
-      f[j][1] -= fij[1];
-      f[j][2] -= fij[2];
-
-      // tally per-atom virial contribution
-
-      if (vflag)
-        ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
-                     fij[0],fij[1],fij[2],
-                     -snaptr->rij[jj][0],-snaptr->rij[jj][1],
-                     -snaptr->rij[jj][2]);
-    }
-
-    // tally energy contribution
-
-    if (eflag) {
-
-      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
-
-      double* coeffi = coeffelem[ielem];
-      evdwl = coeffi[0];
-      if (!quadraticflag) {
-        snaptr->compute_bi();
-        snaptr->copy_bi2bvec();
-      }
-
-      // E = beta.B + 0.5*B^t.alpha.B
-      // coeff[k] = beta[k-1] or
-      // coeff[k] = alpha_ii or
-      // coeff[k] = alpha_ij = alpha_ji, j != i
-
-      // linear contributions
-
-      for (int k = 1; k <= ncoeff; k++)
-        evdwl += coeffi[k]*snaptr->bvec[k-1];
-
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = snaptr->bvec[icoeff];
-          evdwl += 0.5*coeffi[k++]*bveci*bveci;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            evdwl += coeffi[k++]*bveci*snaptr->bvec[jcoeff];
-          }
-        }
-      }
-      ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
-    }
-
-  }
-
-  if (vflag_fdotr) virial_fdotr_compute();
-}
-
-
-/* ----------------------------------------------------------------------
-   This version is optimized for threading, micro-load balancing
-   ---------------------------------------------------------------------- */
-
-void PairNNSNAP::compute_optimized(int eflag, int vflag)
-{
-  // if reneighboring took place do load_balance if requested
-  if (do_load_balance > 0 &&
-      (neighbor->ncalls != ncalls_neigh)) {
-    ghostinum = 0;
-    // reset local ghost neighbor lists
-    ncalls_neigh = neighbor->ncalls;
-    if (ilistmask_max < list->inum) {
-      memory->grow(ilistmask,list->inum,"PairSnap::ilistmask");
-      ilistmask_max = list->inum;
-    }
-    for (int i = 0; i < list->inum; i++)
-      ilistmask[i] = 1;
-
-    //multiple passes for loadbalancing
-    for (int i = 0; i < do_load_balance; i++)
-      load_balance();
-  }
-
-  int numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      numpairs += jnum;
-    }
-  }
-
-  if (do_load_balance)
-    for (int ii = 0; ii < ghostinum; ii++) {
-      int i = ghostilist[ii];
-      int jnum = ghostnumneigh[i];
-      numpairs += jnum;
-    }
-
-  // optimized schedule setting
-
-  int time_dynamic = 0;
-  int time_guided = 0;
-
-  if (schedule_user == 0) schedule_user = 4;
-
-  switch (schedule_user) {
-  case 1:
-    omp_set_schedule(omp_sched_static,1);
-    break;
-  case 2:
-    omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  case 3:
-    omp_set_schedule(omp_sched_guided,2);
-    break;
-  case 4:
-    omp_set_schedule(omp_sched_auto,0);
-    break;
-  case 5:
-    if (numpairs < 8*nthreads) omp_set_schedule(omp_sched_dynamic,1);
-    else if (schedule_time_guided < 0.0) {
-      omp_set_schedule(omp_sched_guided,2);
-      if (!eflag && !vflag) time_guided = 1;
-    } else if (schedule_time_dynamic<0.0) {
-      omp_set_schedule(omp_sched_dynamic,1);
-      if (!eflag && !vflag) time_dynamic = 1;
-    } else if (schedule_time_guided<schedule_time_dynamic)
-      omp_set_schedule(omp_sched_guided,2);
-    else
-      omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  }
-
-  if (use_shared_arrays)
-    build_per_atom_arrays();
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(eflag,vflag,time_dynamic,time_guided) firstprivate(numpairs) default(none)
-#endif
-  {
-    // begin of pragma omp parallel
-
-    int tid = omp_get_thread_num();
-    int** pairs_tid_unique = NULL;
-
-    int** pairs;
-    if (use_shared_arrays) pairs = i_pairs;
-    else {
-      memory->create(pairs_tid_unique,numpairs,4,"numpairs");
-      pairs = pairs_tid_unique;
-    }
-
-    if (!use_shared_arrays) {
-      numpairs = 0;
-      for (int ii = 0; ii < list->inum; ii++) {
-        if ((do_load_balance <= 0) || ilistmask[ii]) {
-          int i = list->ilist[ii];
-          int jnum = list->numneigh[i];
-          for (int jj = 0; jj<jnum; jj++) {
-            pairs[numpairs][0] = i;
-            pairs[numpairs][1] = jj;
-            pairs[numpairs][2] = -1;
-            numpairs++;
-          }
-        }
-      }
-
-      for (int ii = 0; ii < ghostinum; ii++) {
-        int i = ghostilist[ii];
-        int jnum = ghostnumneigh[i];
-        for (int jj = 0; jj<jnum; jj++) {
-          pairs[numpairs][0] = i;
-          pairs[numpairs][1] = jj;
-          pairs[numpairs][2] = -1;
-          numpairs++;
-        }
-      }
-    }
-
-    int ielem;
-    int jj,k,jnum,jtype,ninside;
-    double delx,dely,delz,evdwl,rsq;
-    double fij[3];
-    int *jlist,*numneigh,**firstneigh;
-    evdwl = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp master
-#endif
-    {
-      ev_init(eflag,vflag);
-    }
-
-#if defined(_OPENMP)
-#pragma omp barrier
-    { ; }
-#endif
-
-    double **x = atom->x;
-    double **f = atom->f;
-    int *type = atom->type;
-    int nlocal = atom->nlocal;
-    int newton_pair = force->newton_pair;
-
-    numneigh = list->numneigh;
-    firstneigh = list->firstneigh;
-
-#ifdef TIMING_INFO
-    // only update micro timers after setup
-    static int count=0;
-    if (count<2) {
-      sna[tid]->timers[0] = 0;
-      sna[tid]->timers[1] = 0;
-      sna[tid]->timers[2] = 0;
-      sna[tid]->timers[3] = 0;
-      sna[tid]->timers[4] = 0;
-    }
-    count++;
-#endif
-
-    // did thread start working on interactions of new atom
-    int iold = -1;
-
-    double starttime, endtime;
-    if (time_dynamic || time_guided)
-      starttime = MPI_Wtime();
-
-#if defined(_OPENMP)
-#pragma omp for schedule(runtime)
-#endif
-    for (int iijj = 0; iijj < numpairs; iijj++) {
-      int i = 0;
-      if (use_shared_arrays) {
-        i = i_pairs[iijj][0];
-        if (iold != i) {
-          set_sna_to_shared(tid,i_pairs[iijj][3]);
-          ielem = map[type[i]];
-        }
-        iold = i;
-      } else {
-        i = pairs[iijj][0];
-        if (iold != i) {
-          iold = i;
-          const double xtmp = x[i][0];
-          const double ytmp = x[i][1];
-          const double ztmp = x[i][2];
-          const int itype = type[i];
-          ielem = map[itype];
-          const double radi = radelem[ielem];
-
-          if (i < nlocal) {
-            jlist = firstneigh[i];
-            jnum = numneigh[i];
-          } else {
-            jlist = ghostneighs+ghostfirstneigh[i];
-            jnum = ghostnumneigh[i];
-          }
-
-          // insure rij, inside, wj, and rcutij are of size jnum
-
-          sna[tid]->grow_rij(jnum);
-
-          // rij[][3] = displacements between atom I and those neighbors
-          // inside = indices of neighbors of I within cutoff
-          // wj = weights of neighbors of I within cutoff
-          // rcutij = cutoffs of neighbors of I within cutoff
-          // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
-
-          ninside = 0;
-          for (jj = 0; jj < jnum; jj++) {
-            int j = jlist[jj];
-            j &= NEIGHMASK;
-            delx = x[j][0] - xtmp; //unitialised
-            dely = x[j][1] - ytmp;
-            delz = x[j][2] - ztmp;
-            rsq = delx*delx + dely*dely + delz*delz;
-            jtype = type[j];
-            int jelem = map[jtype];
-
-            if (rsq < cutsq[itype][jtype]&&rsq>1e-20) { //unitialised
-              sna[tid]->rij[ninside][0] = delx;
-              sna[tid]->rij[ninside][1] = dely;
-              sna[tid]->rij[ninside][2] = delz;
-              sna[tid]->inside[ninside] = j;
-              sna[tid]->wj[ninside] = wjelem[jelem];
-              sna[tid]->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
-              ninside++;
-
-              // update index list with inside index
-              pairs[iijj + (jj - pairs[iijj][1])][2] =
-                ninside-1; //unitialised
-            }
-          }
-
-          // compute Ui and Zi for atom I
-
-          sna[tid]->compute_ui(ninside); //unitialised
-          sna[tid]->compute_zi();
-        }
-      }
-      if (quadraticflag) {
-        sna[tid]->compute_bi();
-        sna[tid]->copy_bi2bvec();
-      }
-
-      // for neighbors of I within cutoff:
-      // compute dUi/drj and dBi/drj
-      // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
-
-      // entry into loop if inside index is set
-
-      double* coeffi = coeffelem[ielem];
-
-      if (pairs[iijj][2] >= 0) {
-        jj = pairs[iijj][2];
-        int j = sna[tid]->inside[jj];
-        sna[tid]->compute_duidrj(sna[tid]->rij[jj],
-                                 sna[tid]->wj[jj],sna[tid]->rcutij[jj]);
-
-        sna[tid]->compute_dbidrj();
-        sna[tid]->copy_dbi2dbvec();
-
-        fij[0] = 0.0;
-        fij[1] = 0.0;
-        fij[2] = 0.0;
-
-        // linear contributions
-
-        for (k = 1; k <= ncoeff; k++) {
-          double bgb = coeffi[k];
-          fij[0] += bgb*sna[tid]->dbvec[k-1][0];
-          fij[1] += bgb*sna[tid]->dbvec[k-1][1];
-          fij[2] += bgb*sna[tid]->dbvec[k-1][2];
-        }
-
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = sna[tid]->bvec[icoeff];
-          double fack = coeffi[k]*bveci;
-          double* dbveci = sna[tid]->dbvec[icoeff];
-          fij[0] += fack*sna[tid]->dbvec[icoeff][0];
-          fij[1] += fack*sna[tid]->dbvec[icoeff][1];
-          fij[2] += fack*sna[tid]->dbvec[icoeff][2];
-          k++;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            double facki = coeffi[k]*bveci;
-            double fackj = coeffi[k]*sna[tid]->bvec[jcoeff];
-            double* dbvecj = sna[tid]->dbvec[jcoeff];
-            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-            k++;
-          }
-        }
-      }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        {
-          f[i][0] += fij[0];
-          f[i][1] += fij[1];
-          f[i][2] += fij[2];
-          f[j][0] -= fij[0];
-          f[j][1] -= fij[1];
-          f[j][2] -= fij[2];
-
-          // tally per-atom virial contribution
-
-          if (vflag)
-            ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
-                         fij[0],fij[1],fij[2],
-                         -sna[tid]->rij[jj][0],-sna[tid]->rij[jj][1],
-                         -sna[tid]->rij[jj][2]);
-        }
-      }
-
-      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
-      // only call this for first pair of each atom i
-      // if atom has no pairs, eatom=0, which is wrong
-
-      if (eflag&&pairs[iijj][1] == 0) {
-        evdwl = coeffi[0];
-
-        if (!quadraticflag) {
-          sna[tid]->compute_bi();
-          sna[tid]->copy_bi2bvec();
-        }
-
-        // E = beta.B + 0.5*B^t.alpha.B
-        // coeff[k] = beta[k-1] or
-        // coeff[k] = alpha_ii or
-        // coeff[k] = alpha_ij = alpha_ji, j != i
-
-        // linear contributions
-
-        for (int k = 1; k <= ncoeff; k++)
-          evdwl += coeffi[k]*sna[tid]->bvec[k-1];
-
-        // quadratic contributions
-
-        if (quadraticflag) {
-          int k = ncoeff+1;
-          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bveci = sna[tid]->bvec[icoeff];
-            evdwl += 0.5*coeffi[k++]*bveci*bveci;
-            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              evdwl += coeffi[k++]*bveci*sna[tid]->bvec[jcoeff];
-            }
-          }
-        }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
-      }
-
-    }
-    if (time_dynamic || time_guided)
-      endtime = MPI_Wtime();
-    if (time_dynamic) schedule_time_dynamic = endtime - starttime;
-    if (time_guided) schedule_time_guided = endtime - starttime;
-    if (!use_shared_arrays) memory->destroy(pairs);
-
-  }// end of pragma omp parallel
-
-  if (vflag_fdotr) virial_fdotr_compute();
-
-}
-
-inline int PairNNSNAP::equal(double* x,double* y)
-{
-  double dist2 =
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-  if (dist2 < 1e-20) return 1;
-  return 0;
-}
-
-inline double PairNNSNAP::dist2(double* x,double* y)
-{
-  return
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-}
-
-// return extra communication cutoff
-// extra_cutoff = max(subdomain_length)
-
-double PairNNSNAP::extra_cutoff()
-{
-  double sublo[3],subhi[3];
-
-  if (domain->triclinic == 0) {
-    for (int dim = 0 ; dim < 3 ; dim++) {
-      sublo[dim] = domain->sublo[dim];
-      subhi[dim] = domain->subhi[dim];
-    }
-  } else {
-    domain->lamda2x(domain->sublo_lamda,sublo);
-    domain->lamda2x(domain->subhi_lamda,subhi);
-  }
-
-  double sub_size[3];
-  for (int dim = 0; dim < 3; dim++)
-    sub_size[dim] = subhi[dim] - sublo[dim];
-
-  double max_sub_size = 0;
-  for (int dim = 0; dim < 3; dim++)
-    max_sub_size = MAX(max_sub_size,sub_size[dim]);
-
-  // note: for triclinic, probably need something different
-  // see Comm::setup()
-
-  return max_sub_size;
-}
-
-// micro load_balancer: each MPI process will
-// check with each of its 26 neighbors,
-// whether an imbalance exists in the number
-// of atoms to calculate forces for.
-// If it does it will set ilistmask of one of
-// its local atoms to zero, and send its Tag
-// to the neighbor process. The neighboring process
-// will check its ghost list for the
-// ghost atom with the same Tag which is closest
-// to its domain center, and build a
-// neighborlist for this ghost atom. For this to work,
-// the communication cutoff has to be
-// as large as the neighbor cutoff +
-// maximum subdomain length.
-
-// Note that at most one atom is exchanged per processor pair.
-
-// Also note that the local atom assignment
-// doesn't change. This load balancer will cause
-// some ghost atoms to have full neighborlists
-// which are unique to PairNNSNAP.
-// They are not part of the generally accessible neighborlist.
-// At the same time corresponding local atoms on
-// other MPI processes will not be
-// included in the force computation since
-// their ilistmask is 0. This does not effect
-// any other classes which might
-// access the same general neighborlist.
-// Reverse communication (newton on) of forces is required.
-
-// Currently the load balancer does two passes,
-// since its exchanging atoms upstream and downstream.
-
-void PairNNSNAP::load_balance()
-{
-  double sublo[3],subhi[3];
-  if (domain->triclinic == 0) {
-    double* sublotmp = domain->sublo;
-    double* subhitmp = domain->subhi;
-    for (int dim = 0 ; dim<3 ; dim++) {
-      sublo[dim]=sublotmp[dim];
-      subhi[dim]=subhitmp[dim];
-    }
-  } else {
-    double* sublotmp = domain->sublo_lamda;
-    double* subhitmp = domain->subhi_lamda;
-    domain->lamda2x(sublotmp,sublo);
-    domain->lamda2x(subhitmp,subhi);
-  }
-
-  //if (list->inum==0) list->grow(atom->nmax);
-
-  int nlocal = ghostinum;
-  for (int i=0; i < list->inum; i++)
-    if (ilistmask[i]) nlocal++;
-  int ***grid2proc = comm->grid2proc;
-  int* procgrid = comm->procgrid;
-
-  int nlocal_up,nlocal_down;
-  MPI_Request request;
-
-  double sub_mid[3];
-  for (int dim=0; dim<3; dim++)
-    sub_mid[dim] = (subhi[dim] + sublo[dim])/2;
-
-  if (comm->cutghostuser <
-      neighbor->cutneighmax+extra_cutoff())
-    error->all(FLERR,"Communication cutoff too small for SNAP micro load balancing");
-
-  int nrecv = ghostinum;
-  int totalsend = 0;
-  int nsend = 0;
-  int depth = 1;
-
-  for (int dx = -depth; dx < depth+1; dx++)
-    for (int dy = -depth; dy < depth+1; dy++)
-      for (int dz = -depth; dz < depth+1; dz++) {
-
-        if (dx == dy && dy == dz && dz == 0) continue;
-
-        int sendloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        sendloc[0] += dx;
-        sendloc[1] += dy;
-        sendloc[2] += dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] >= procgrid[dim])
-            sendloc[dim] = sendloc[dim] - procgrid[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] < 0)
-            sendloc[dim] = procgrid[dim] + sendloc[dim];
-        int recvloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        recvloc[0] -= dx;
-        recvloc[1] -= dy;
-        recvloc[2] -= dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] < 0)
-            recvloc[dim] = procgrid[dim] + recvloc[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] >= procgrid[dim])
-            recvloc[dim] = recvloc[dim] - procgrid[dim];
-
-        int sendproc = grid2proc[sendloc[0]][sendloc[1]][sendloc[2]];
-        int recvproc = grid2proc[recvloc[0]][recvloc[1]][recvloc[2]];
-
-        // two stage process, first upstream movement, then downstream
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-        nsend = 0;
-
-        // send upstream
-
-        if (nlocal > nlocal_up+1) {
-
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i] == 0)
-            i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,recvproc,0,world,&request);
-          else {
-            int j = -1;
-            MPI_Isend(&j,1,MPI_INT,recvproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j = 0; j < list->inum; j++)
-              if (list->ilist[j] == i)
-                ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // recv downstream
-
-        if (nlocal < nlocal_down-1) {
-          nlocal++;
-          int get_tag = -1;
-          MPI_Recv(&get_tag,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-          // if get_tag -1 the other process didnt have local atoms to send
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal+atom->nghost+100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid, atom->x[j]) < mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid, atom->x[j]);
-                }
-
-            // build neighborlist for this particular
-            // ghost atom, and add it to list->ilist
-
-            if (ghostneighs_max - ghostneighs_total <
-                neighbor->oneatom) {
-              memory->grow(ghostneighs,
-                           ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            // find all neighbors by looping
-            // over all local and ghost atoms
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-
-        // decrease nlocal later, so that it is the
-        // initial number both for receiving and sending
-
-        if (nsend) nlocal--;
-
-        // second pass through the grid
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-        // send downstream
-
-        nsend=0;
-        if (nlocal > nlocal_down+1) {
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i]==0) i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,sendproc,0,world,&request);
-          else {
-            int j =- 1;
-            MPI_Isend(&j,1,MPI_INT,sendproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j=0; j<list->inum; j++)
-              if (list->ilist[j] == i) ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // receive upstream
-
-        if (nlocal < nlocal_up-1) {
-          nlocal++;
-          int get_tag = -1;
-
-          MPI_Recv(&get_tag,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal + atom->nghost + 100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid,atom->x[j])<mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid,atom->x[j]);
-                }
-
-            // build neighborlist for this particular ghost atom
-
-            if (ghostneighs_max-ghostneighs_total < neighbor->oneatom) {
-              memory->grow(ghostneighs,ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-        if (nsend) nlocal--;
-      }
-}
-
-void PairNNSNAP::set_sna_to_shared(int snaid,int i)
-{
-  sna[snaid]->rij = i_rij[i];
-  sna[snaid]->inside = i_inside[i];
-  sna[snaid]->wj = i_wj[i];
-  sna[snaid]->rcutij = i_rcutij[i];
-  sna[snaid]->zarray_r = i_zarray_r[i];
-  sna[snaid]->zarray_i = i_zarray_i[i];
-  sna[snaid]->uarraytot_r = i_uarraytot_r[i];
-  sna[snaid]->uarraytot_i = i_uarraytot_i[i];
-}
-
-void PairNNSNAP::build_per_atom_arrays()
-{
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-  int count = 0;
-  int neighmax = 0;
-  for (int ii = 0; ii < list->inum; ii++)
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      neighmax=MAX(neighmax,list->numneigh[list->ilist[ii]]);
-      ++count;
-    }
-  for (int ii = 0; ii < ghostinum; ii++) {
-    neighmax=MAX(neighmax,ghostnumneigh[ghostilist[ii]]);
-    ++count;
-  }
-
-  if (i_max < count || i_neighmax < neighmax) {
-    int i_maxt = MAX(count,i_max);
-    i_neighmax = MAX(neighmax,i_neighmax);
-    memory->destroy(i_rij);
-    memory->destroy(i_inside);
-    memory->destroy(i_wj);
-    memory->destroy(i_rcutij);
-    memory->destroy(i_ninside);
-    memory->destroy(i_pairs);
-    memory->create(i_rij,i_maxt,i_neighmax,3,"PairNNSNAP::i_rij");
-    memory->create(i_inside,i_maxt,i_neighmax,"PairNNSNAP::i_inside");
-    memory->create(i_wj,i_maxt,i_neighmax,"PairNNSNAP::i_wj");
-    memory->create(i_rcutij,i_maxt,i_neighmax,"PairNNSNAP::i_rcutij");
-    memory->create(i_ninside,i_maxt,"PairNNSNAP::i_ninside");
-    memory->create(i_pairs,i_maxt*i_neighmax,4,"PairNNSNAP::i_pairs");
-  }
-
-  if (i_max < count) {
-    int jdim = sna[0]->twojmax+1;
-    memory->destroy(i_uarraytot_r);
-    memory->destroy(i_uarraytot_i);
-    memory->create(i_uarraytot_r,count,jdim,jdim,jdim,
-                   "PairNNSNAP::i_uarraytot_r");
-    memory->create(i_uarraytot_i,count,jdim,jdim,jdim,
-                   "PairNNSNAP::i_uarraytot_i");
-    if (i_zarray_r != NULL)
-      for (int i = 0; i < i_max; i++) {
-        memory->destroy(i_zarray_r[i]);
-        memory->destroy(i_zarray_i[i]);
-      }
-
-    delete [] i_zarray_r;
-    delete [] i_zarray_i;
-    i_zarray_r = new double*****[count];
-    i_zarray_i = new double*****[count];
-    for (int i = 0; i < count; i++) {
-      memory->create(i_zarray_r[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairNNSNAP::i_zarray_r");
-      memory->create(i_zarray_i[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairNNSNAP::i_zarray_i");
-    }
-  }
-
-  if (i_max < count)
-    i_max = count;
-
-  count = 0;
-  i_numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      int* jlist = list->firstneigh[i];
-      const double xtmp = atom->x[i][0];
-      const double ytmp = atom->x[i][1];
-      const double ztmp = atom->x[i][2];
-      const int itype = atom->type[i];
-      const int ielem = map[itype];
-      const double radi = radelem[ielem];
-      int ninside = 0;
-      for (int jj = 0; jj < jnum; jj++) {
-        int j = jlist[jj];
-        j &= NEIGHMASK;
-        const double delx = atom->x[j][0] - xtmp;
-        const double dely = atom->x[j][1] - ytmp;
-        const double delz = atom->x[j][2] - ztmp;
-        const double rsq = delx*delx + dely*dely + delz*delz;
-        int jtype = atom->type[j];
-        int jelem = map[jtype];
-
-        i_pairs[i_numpairs][0] = i;
-        i_pairs[i_numpairs][1] = jj;
-        i_pairs[i_numpairs][2] = -1;
-        i_pairs[i_numpairs][3] = count;
-        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          i_rij[count][ninside][0] = delx;
-          i_rij[count][ninside][1] = dely;
-          i_rij[count][ninside][2] = delz;
-          i_inside[count][ninside] = j;
-          i_wj[count][ninside] = wjelem[jelem];
-          i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-
-          // update index list with inside index
-          i_pairs[i_numpairs][2] = ninside++;
-        }
-        i_numpairs++;
-      }
-      i_ninside[count] = ninside;
-      count++;
-    }
-  }
-
-  for (int ii = 0; ii < ghostinum; ii++) {
-    int i = ghostilist[ii];
-    int jnum = ghostnumneigh[i];
-    int* jlist = ghostneighs+ghostfirstneigh[i];
-    const double xtmp = atom->x[i][0];
-    const double ytmp = atom->x[i][1];
-    const double ztmp = atom->x[i][2];
-    const int itype = atom->type[i];
-    const int ielem = map[itype];
-    const double radi = radelem[ielem];
-    int ninside = 0;
-
-    for (int jj = 0; jj < jnum; jj++) {
-      int j = jlist[jj];
-      j &= NEIGHMASK;
-      const double delx = atom->x[j][0] - xtmp;
-      const double dely = atom->x[j][1] - ytmp;
-      const double delz = atom->x[j][2] - ztmp;
-      const double rsq = delx*delx + dely*dely + delz*delz;
-      int jtype = atom->type[j];
-      int jelem = map[jtype];
-
-      i_pairs[i_numpairs][0] = i;
-      i_pairs[i_numpairs][1] = jj;
-      i_pairs[i_numpairs][2] = -1;
-      i_pairs[i_numpairs][3] = count;
-      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-        i_rij[count][ninside][0] = delx;
-        i_rij[count][ninside][1] = dely;
-        i_rij[count][ninside][2] = delz;
-        i_inside[count][ninside] = j;
-        i_wj[count][ninside] = wjelem[jelem];
-        i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-        // update index list with inside index
-        i_pairs[i_numpairs][2] = ninside++;
-      }
-      i_numpairs++;
-    }
-    i_ninside[count] = ninside;
-    count++;
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-#if defined(_OPENMP)
-#pragma omp parallel for shared(count) default(none)
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    //sna[tid]->compute_ui(i_ninside[ii]);
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-    sna[tid]->compute_ui_omp(i_ninside[ii],MAX(int(nthreads/count),1));
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&endtime);
-    sna[tid]->timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                          (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-  }
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = 0;//omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    sna[tid]->compute_zi_omp(MAX(int(nthreads/count),1));
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  sna[0]->timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                      (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-}
-
-/* ----------------------------------------------------------------------
-   compute beta
-------------------------------------------------------------------------- */
-
-void PairNNSNAP::compute_beta()
-{
-  int i;
-  int *type = atom->type;
-
-  for (int ii = 0; ii < list->inum; ii++) {
-    i = list->ilist[ii];
-    const int itype = type[i];
-    const int ielem = map[itype];
-    double* coeffi = coeffelem[ielem];
-
-    for (int k = 1; k <= ncoeff; k++)
-      beta[ii][k-1] = coeffi[k];
-  }
-}
-
-/* ----------------------------------------------------------------------
-   allocate all arrays
-------------------------------------------------------------------------- */
-
-void PairNNSNAP::allocate()
-{
-  allocated = 1;
-  int n = atom->ntypes;
-
-  memory->create(setflag,n+1,n+1,"pair:setflag");
-  memory->create(cutsq,n+1,n+1,"pair:cutsq");
-  memory->create(map,n+1,"pair:map");
-}
-
-/* ----------------------------------------------------------------------
-   global settings
-------------------------------------------------------------------------- */
-
-void PairNNSNAP::settings(int narg, char **arg)
-{
-
-  // set default values for optional arguments
-
-  nthreads = -1;
-  use_shared_arrays=-1;
-  do_load_balance = 0;
-  use_optimized = 1;
-
-  // optional arguments
-
-  for (int i=0; i < narg; i++) {
-    if (i+2>narg) error->all(FLERR,"Illegal pair_style command");
-    if (strcmp(arg[i],"nthreads")==0) {
-      nthreads=force->inumeric(FLERR,arg[++i]);
-#if defined(LMP_USER_OMP)
-      error->all(FLERR,"Must set number of threads via package omp command");
-#else
-      omp_set_num_threads(nthreads);
-      comm->nthreads=nthreads;
-#endif
-      continue;
-    }
-    if (strcmp(arg[i],"optimized")==0) {
-      use_optimized=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"shared")==0) {
-      use_shared_arrays=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"loadbalance")==0) {
-      do_load_balance = force->inumeric(FLERR,arg[++i]);
-      if (do_load_balance) {
-        double mincutoff = extra_cutoff() +
-          rcutmax + neighbor->skin;
-        if (comm->cutghostuser < mincutoff) {
-          char buffer[255];
-
-          //apparently mincutoff is 0 after sprintf command ?????
-
-          double tmp = mincutoff + 0.1;
-          sprintf(buffer, "Communication cutoff is too small "
-                  "for SNAP micro load balancing, increased to %lf",
-                  mincutoff+0.1);
-          if (comm->me==0)
-            error->warning(FLERR,buffer);
-
-          comm->cutghostuser = tmp;
-
-        }
-      }
-      continue;
-    }
-    if (strcmp(arg[i],"schedule")==0) {
-      i++;
-      if (strcmp(arg[i],"static")==0)
-        schedule_user = 1;
-      if (strcmp(arg[i],"dynamic")==0)
-        schedule_user = 2;
-      if (strcmp(arg[i],"guided")==0)
-        schedule_user = 3;
-      if (strcmp(arg[i],"auto")==0)
-        schedule_user = 4;
-      if (strcmp(arg[i],"determine")==0)
-        schedule_user = 5;
-      if (schedule_user == 0)
-        error->all(FLERR,"Illegal pair_style command");
-      continue;
-    }
-    error->all(FLERR,"Illegal pair_style command");
-  }
-
-  if (nthreads < 0)
-    nthreads = comm->nthreads;
-
-  if (use_shared_arrays < 0) {
-    if (nthreads > 1 && atom->nlocal <= 2*nthreads)
-      use_shared_arrays = 1;
-    else use_shared_arrays = 0;
-  }
-
-  // check if running non-optimized code with
-  // optimization flags set
-
-  if (!use_optimized)
-    if (nthreads > 1 ||
-        use_shared_arrays ||
-        do_load_balance ||
-        schedule_user)
-      error->all(FLERR,"Illegal pair_style command");
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-------------------------------------------------------------------------- */
-
-void PairNNSNAP::coeff(int narg, char **arg)
-{
-  if (narg < 5) error->all(FLERR,"Incorrect args for pair coefficients");
-  if (!allocated) allocate();
-
-  if (nelements) {
-    for (int i = 0; i < nelements; i++)
-      delete[] elements[i];
-    delete[] elements;
-    memory->destroy(radelem);
-    memory->destroy(wjelem);
-    memory->destroy(coeffelem);
-    memory->destroy(beta);
-  }
-
-  char* type1 = arg[0];
-  char* type2 = arg[1];
-  char* coefffilename = arg[2];
-  char* paramfilename = arg[3];
-  char** elemtypes = &arg[4];
-
-  // insure I,J args are * *
-
-  if (strcmp(type1,"*") != 0 || strcmp(type2,"*") != 0)
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  // read snapcoeff and snapparam files
-
-  read_files(coefffilename,paramfilename);
-
-  if (!quadraticflag)
-    ncoeff = ncoeffall - 1;
-  else {
-
-    // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2
-    // so, ncoeff = floor(sqrt(2*ncoeffall))-1
-
-    ncoeff = sqrt(2*ncoeffall)-1;
-    ncoeffq = (ncoeff*(ncoeff+1))/2;
-    int ntmp = 1+ncoeff+ncoeffq;
-    if (ntmp != ncoeffall) {
-      printf("ncoeffall = %d ntmp = %d ncoeff = %d \n",ncoeffall,ntmp,ncoeff);
-      error->all(FLERR,"Incorrect SNAP coeff file");
-    }
-  }
-
-  // read args that map atom types to SNAP elements
-  // map[i] = which element the Ith atom type is, -1 if not mapped
-  // map[0] is not used
-
-  for (int i = 1; i <= atom->ntypes; i++) {
-    char* elemname = elemtypes[i-1];
-    int jelem;
-    for (jelem = 0; jelem < nelements; jelem++)
-      if (strcmp(elemname,elements[jelem]) == 0)
-        break;
-
-    if (jelem < nelements)
-      map[i] = jelem;
-    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
-    else error->all(FLERR,"Incorrect args for pair coefficients");
-  }
-
-  // clear setflag since coeff() called once with I,J = * *
-
-  int n = atom->ntypes;
-  for (int i = 1; i <= n; i++)
-    for (int j = i; j <= n; j++)
-      setflag[i][j] = 0;
-
-  // set setflag i,j for type pairs where both are mapped to elements
-
-  int count = 0;
-  for (int i = 1; i <= n; i++)
-    for (int j = i; j <= n; j++)
-      if (map[i] >= 0 && map[j] >= 0) {
-        setflag[i][j] = 1;
-        count++;
-      }
-
-  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
-
-  sna = new SNA*[nthreads];
-
-  // allocate memory for per OpenMP thread data which
-  // is wrapped into the sna class
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid] = new SNA(lmp,rfac0,twojmax,
-                       diagonalstyle,use_shared_arrays,
-                       rmin0,switchflag,bzeroflag);
-    if (!use_shared_arrays)
-      sna[tid]->grow_rij(nmax);
-  }
-
-  if (ncoeff != sna[0]->ncoeff) {
-    if (comm->me == 0)
-      printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
-    error->all(FLERR,"Incorrect SNAP parameter file");
-  }
-
-  // Calculate maximum cutoff for all elements
-
-  rcutmax = 0.0;
-  for (int ielem = 0; ielem < nelements; ielem++)
-    rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax);
-
-}
-
-/* ----------------------------------------------------------------------
-   init specific to this pair style
-------------------------------------------------------------------------- */
-
-void PairNNSNAP::init_style()
-{
-  if (force->newton_pair == 0)
-    error->all(FLERR,"Pair style SNAP requires newton pair on");
-
-  // need a full neighbor list
-
-  int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->full = 1;
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid]->init();
-  }
-
-}
-
-/* ----------------------------------------------------------------------
-   init for one type pair i,j and corresponding j,i
-------------------------------------------------------------------------- */
-
-double PairNNSNAP::init_one(int i, int j)
-{
-  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
-  return (radelem[map[i]] +
-          radelem[map[j]])*rcutfac;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairNNSNAP::read_files(char *coefffilename, char *paramfilename)
-{
-
-  // open SNAP coefficient file on proc 0
-
-  FILE *fpcoeff;
-  if (comm->me == 0) {
-    fpcoeff = force->open_potential(coefffilename);
-    if (fpcoeff == NULL) {
-      char str[128];
-      snprintf(str,128,"Cannot open SNAP coefficient file %s",coefffilename);
-      error->one(FLERR,str);
-    }
-  }
-
-  char line[MAXLINE],*ptr;
-  int eof = 0;
-
-  int n;
-  int nwords = 0;
-  while (nwords == 0) {
-    if (comm->me == 0) {
-      ptr = fgets(line,MAXLINE,fpcoeff);
-      if (ptr == NULL) {
-        eof = 1;
-        fclose(fpcoeff);
-      } else n = strlen(line) + 1;
-    }
-    MPI_Bcast(&eof,1,MPI_INT,0,world);
-    if (eof) break;
-    MPI_Bcast(&n,1,MPI_INT,0,world);
-    MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-    // strip comment, skip line if blank
-
-    if ((ptr = strchr(line,'#'))) *ptr = '\0';
-    nwords = atom->count_words(line);
-  }
-  if (nwords != 2)
-    error->all(FLERR,"Incorrect format in SNAP coefficient file");
-
-  // words = ptrs to all words in line
-  // strip single and double quotes from words
-
-  char* words[MAXWORD];
-  int iword = 0;
-  words[iword] = strtok(line,"' \t\n\r\f");
-  iword = 1;
-  words[iword] = strtok(NULL,"' \t\n\r\f");
-
-  nelements = atoi(words[0]);
-  ncoeffall = atoi(words[1]);
-
-  // set up element lists
-
-  elements = new char*[nelements];
-  memory->create(radelem,nelements,"pair:radelem");
-  memory->create(wjelem,nelements,"pair:wjelem");
-  memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
-
-  // Loop over nelements blocks in the SNAP coefficient file
-
-  for (int ielem = 0; ielem < nelements; ielem++) {
-
-    if (comm->me == 0) {
-      ptr = fgets(line,MAXLINE,fpcoeff);
-      if (ptr == NULL) {
-        eof = 1;
-        fclose(fpcoeff);
-      } else n = strlen(line) + 1;
-    }
-    MPI_Bcast(&eof,1,MPI_INT,0,world);
-    if (eof)
-      error->all(FLERR,"Incorrect format in SNAP coefficient file");
-    MPI_Bcast(&n,1,MPI_INT,0,world);
-    MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-    nwords = atom->count_words(line);
-    if (nwords != 3)
-      error->all(FLERR,"Incorrect format in SNAP coefficient file");
-
-    iword = 0;
-    words[iword] = strtok(line,"' \t\n\r\f");
-    iword = 1;
-    words[iword] = strtok(NULL,"' \t\n\r\f");
-    iword = 2;
-    words[iword] = strtok(NULL,"' \t\n\r\f");
-
-    char* elemtmp = words[0];
-    int n = strlen(elemtmp) + 1;
-    elements[ielem] = new char[n];
-    strcpy(elements[ielem],elemtmp);
-
-    radelem[ielem] = atof(words[1]);
-    wjelem[ielem] = atof(words[2]);
-
-
-    if (comm->me == 0) {
-      if (screen) fprintf(screen,"SNAP Element = %s, Radius %g, Weight %g \n",
-                          elements[ielem], radelem[ielem], wjelem[ielem]);
-      if (logfile) fprintf(logfile,"SNAP Element = %s, Radius %g, Weight %g \n",
-                          elements[ielem], radelem[ielem], wjelem[ielem]);
-    }
-
-    for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
-      if (comm->me == 0) {
-        ptr = fgets(line,MAXLINE,fpcoeff);
-        if (ptr == NULL) {
-          eof = 1;
-          fclose(fpcoeff);
-        } else n = strlen(line) + 1;
-      }
-
-      MPI_Bcast(&eof,1,MPI_INT,0,world);
-      if (eof)
-        error->all(FLERR,"Incorrect format in SNAP coefficient file");
-      MPI_Bcast(&n,1,MPI_INT,0,world);
-      MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-      nwords = atom->count_words(line);
-      if (nwords != 1)
-        error->all(FLERR,"Incorrect format in SNAP coefficient file");
-
-      iword = 0;
-      words[iword] = strtok(line,"' \t\n\r\f");
-
-      coeffelem[ielem][icoeff] = atof(words[0]);
-
-    }
-  }
-
-  // set flags for required keywords
-
-  rcutfacflag = 0;
-  twojmaxflag = 0;
-
-  // Set defaults for optional keywords
-
-  rfac0 = 0.99363;
-  rmin0 = 0.0;
-  diagonalstyle = 3;
-  switchflag = 1;
-  bzeroflag = 1;
-  quadraticflag = 0;
-
-  // open SNAP parameter file on proc 0
-
-  FILE *fpparam;
-  if (comm->me == 0) {
-    fpparam = force->open_potential(paramfilename);
-    if (fpparam == NULL) {
-      char str[128];
-      snprintf(str,128,"Cannot open SNAP parameter file %s",paramfilename);
-      error->one(FLERR,str);
-    }
-  }
-
-  eof = 0;
-  while (1) {
-    if (comm->me == 0) {
-      ptr = fgets(line,MAXLINE,fpparam);
-      if (ptr == NULL) {
-        eof = 1;
-        fclose(fpparam);
-      } else n = strlen(line) + 1;
-    }
-    MPI_Bcast(&eof,1,MPI_INT,0,world);
-    if (eof) break;
-    MPI_Bcast(&n,1,MPI_INT,0,world);
-    MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-    // strip comment, skip line if blank
-
-    if ((ptr = strchr(line,'#'))) *ptr = '\0';
-    nwords = atom->count_words(line);
-    if (nwords == 0) continue;
-
-    if (nwords != 2)
-      error->all(FLERR,"Incorrect format in SNAP parameter file");
-
-    // words = ptrs to all words in line
-    // strip single and double quotes from words
-
-    char* keywd = strtok(line,"' \t\n\r\f");
-    char* keyval = strtok(NULL,"' \t\n\r\f");
-
-    if (comm->me == 0) {
-      if (screen) fprintf(screen,"SNAP keyword %s %s \n",keywd,keyval);
-      if (logfile) fprintf(logfile,"SNAP keyword %s %s \n",keywd,keyval);
-    }
-
-    if (strcmp(keywd,"rcutfac") == 0) {
-      rcutfac = atof(keyval);
-      rcutfacflag = 1;
-    } else if (strcmp(keywd,"twojmax") == 0) {
-      twojmax = atoi(keyval);
-      twojmaxflag = 1;
-    } else if (strcmp(keywd,"rfac0") == 0)
-      rfac0 = atof(keyval);
-    else if (strcmp(keywd,"rmin0") == 0)
-      rmin0 = atof(keyval);
-    else if (strcmp(keywd,"diagonalstyle") == 0)
-      diagonalstyle = atoi(keyval);
-    else if (strcmp(keywd,"switchflag") == 0)
-      switchflag = atoi(keyval);
-    else if (strcmp(keywd,"bzeroflag") == 0)
-      bzeroflag = atoi(keyval);
-    else if (strcmp(keywd,"quadraticflag") == 0)
-      quadraticflag = atoi(keyval);
-    else
-      error->all(FLERR,"Incorrect SNAP parameter file");
-  }
-
-  if (rcutfacflag == 0 || twojmaxflag == 0)
-    error->all(FLERR,"Incorrect SNAP parameter file");
-
-}
-
-/* ----------------------------------------------------------------------
-   memory usage
-------------------------------------------------------------------------- */
-
-double PairNNSNAP::memory_usage()
-{
-  double bytes = Pair::memory_usage();
-  int n = atom->ntypes+1;
-  bytes += n*n*sizeof(int);
-  bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
-  bytes += (2*ncoeffall)*sizeof(double);
-  bytes += (ncoeff*3)*sizeof(double);
-  bytes += sna[0]->memory_usage()*nthreads;
-  return bytes;
-}
-

From be5d3d6a19345a7378e2df0eaea3b7ad6e1ac660 Mon Sep 17 00:00:00 2001
From: athomps <athomps@sandia.gov>
Date: Wed, 12 Jun 2019 17:04:22 -0600
Subject: [PATCH 16/21] Not part of this pull request

---
 examples/snap/in.nnsnap | 45 -----------------------------------------
 1 file changed, 45 deletions(-)
 delete mode 100644 examples/snap/in.nnsnap

diff --git a/examples/snap/in.nnsnap b/examples/snap/in.nnsnap
deleted file mode 100644
index d575757d56..0000000000
--- a/examples/snap/in.nnsnap
+++ /dev/null
@@ -1,45 +0,0 @@
-# Demonstrate SNAP Ta potential
-
-# Initialize simulation
-
-variable nsteps index 100
-variable nrep equal 4
-variable a equal 3.1803
-units		metal
-
-# generate the box and atom positions using a BCC lattice
-
-variable nx equal ${nrep}
-variable ny equal ${nrep}
-variable nz equal ${nrep}
-
-boundary	p p p
-
-lattice         bcc $a
-region		box block 0 ${nx} 0 ${ny} 0 ${nz}
-create_box	1 box
-create_atoms	1 box
-
-mass 1 183.84
-
-# choose potential
-
-include W.nnsnap
-
-# Setup output
-
-thermo		10
-thermo_modify norm yes
-
-# Set up NVE run
-
-timestep 0.5e-3
-neighbor 1.0 bin
-neigh_modify once no every 1 delay 0 check yes
-
-# Run MD
-
-velocity all create 300.0 4928459
-fix 1 all nve
-run             ${nsteps}
-

From c5c03230cb2a1a8649f902b2dce0983c5a309565 Mon Sep 17 00:00:00 2001
From: athomps <athomps@sandia.gov>
Date: Wed, 12 Jun 2019 17:05:47 -0600
Subject: [PATCH 17/21] Not part of this pull request

---
 src/SNAP/pair_nn_snap.h | 184 ----------------------------------------
 1 file changed, 184 deletions(-)
 delete mode 100644 src/SNAP/pair_nn_snap.h

diff --git a/src/SNAP/pair_nn_snap.h b/src/SNAP/pair_nn_snap.h
deleted file mode 100644
index f77ddee207..0000000000
--- a/src/SNAP/pair_nn_snap.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(nn/snap,PairNNSNAP)
-
-#else
-
-#ifndef LMP_PAIR_NN_SNAP_H
-#define LMP_PAIR_NN_SNAP_H
-
-#include "pair.h"
-
-namespace LAMMPS_NS {
-
-class PairNNSNAP : public Pair {
-public:
-  PairNNSNAP(class LAMMPS *);
-  ~PairNNSNAP();
-  virtual void compute(int, int);
-  void compute_regular(int, int);
-  void compute_optimized(int, int);
-  void settings(int, char **);
-  virtual void coeff(int, char **);
-  virtual void init_style();
-  virtual double init_one(int, int);
-  virtual double memory_usage();
-
-  double rcutfac, quadraticflag; // declared public to workaround gcc 4.9
-  int ncoeff;                    //  compiler bug, manifest in KOKKOS package
-
-protected:
-  int ncoeffq, ncoeffall;
-  double **bvec, ***dbvec;
-  class SNA** sna;
-  int nmax;
-  int nthreads;
-  virtual void allocate();
-  void read_files(char *, char *);
-  inline int equal(double* x,double* y);
-  inline double dist2(double* x,double* y);
-  double extra_cutoff();
-  void load_balance();
-  void set_sna_to_shared(int snaid,int i);
-  void build_per_atom_arrays();
-
-  void compute_beta();
-
-  int schedule_user;
-  double schedule_time_guided;
-  double schedule_time_dynamic;
-
-  int ncalls_neigh;
-  int do_load_balance;
-  int ilistmask_max;
-  int* ilistmask;
-  int ghostinum;
-  int ghostilist_max;
-  int* ghostilist;
-  int ghostnumneigh_max;
-  int* ghostnumneigh;
-  int* ghostneighs;
-  int* ghostfirstneigh;
-  int ghostneighs_total;
-  int ghostneighs_max;
-
-  int use_optimized;
-  int use_shared_arrays;
-
-  int i_max;
-  int i_neighmax;
-  int i_numpairs;
-  int **i_pairs;
-  double ***i_rij;
-  int **i_inside;
-  double **i_wj;
-  double **i_rcutij;
-  int *i_ninside;
-  double ****i_uarraytot_r, ****i_uarraytot_i;
-  double ******i_zarray_r, ******i_zarray_i;
-
-#ifdef TIMING_INFO
-  //  timespec starttime, endtime;
-  double timers[4];
-#endif
-
-  double rcutmax;               // max cutoff for all elements
-  int nelements;                // # of unique elements
-  char **elements;              // names of unique elements
-  double *radelem;              // element radii
-  double *wjelem;               // elements weights
-  double **coeffelem;           // element bispectrum coefficients
-  double** beta;                // betas for all atoms in list
-  int *map;                     // mapping from atom types to elements
-  int twojmax, diagonalstyle, switchflag, bzeroflag;
-  double rfac0, rmin0, wj1, wj2;
-  int rcutfacflag, twojmaxflag; // flags for required parameters
-  int beta_max;                 // length of beta
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Communication cutoff too small for SNAP micro load balancing
-
-This can happen if you change the neighbor skin after your pair_style
-command or if your box dimensions grow during a run. You can set the
-cutoff explicitly via the comm_modify cutoff command.
-
-E: Illegal ... command
-
-Self-explanatory.  Check the input script syntax and compare to the
-documentation for the command.  You can use -echo screen as a
-command-line option when running LAMMPS to see the offending line.
-
-E: Must set number of threads via package omp command
-
-Because you are using the USER-OMP package, set the number of threads
-via its settings, not by the pair_style snap nthreads setting.
-
-W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
-
-Self-explanatory.
-
-E: Incorrect args for pair coefficients
-
-Self-explanatory.  Check the input script or data file.
-
-E: Incorrect SNAP coeff file
-
-UNDOCUMENTED
-
-E: Incorrect SNAP parameter file
-
-The file cannot be parsed correctly, check its internal syntax.
-
-E: Pair style SNAP requires newton pair on
-
-See the newton command.  This is a restriction to use the SNAP
-potential.
-
-E: All pair coeffs are not set
-
-All pair coefficients must be set in the data file or by the
-pair_coeff command before running a simulation.
-
-E: Cannot open SNAP coefficient file %s
-
-The specified SNAP coefficient file cannot be opened.  Check that the
-path and name are correct.
-
-E: Incorrect format in SNAP coefficient file
-
-Incorrect number of words per line in the coefficient file.
-
-E: Cannot open SNAP parameter file %s
-
-The specified SNAP parameter file cannot be opened.  Check that the
-path and name are correct.
-
-E: Incorrect format in SNAP parameter file
-
-Incorrect number of words per line in the parameter file.
-
-E: Did not find all elements in SNAP coefficient file.
-
-One or more elements listed in the pair_coeff command were not found in the coefficient file.
-
-*/

From 65b87fa2781c5f7dd9a4599193c026d8b37825ae Mon Sep 17 00:00:00 2001
From: "Aidan P. Thompson" <athomps@chama-login6.sandia.gov>
Date: Thu, 13 Jun 2019 09:54:56 -0600
Subject: [PATCH 18/21] Updated SNAP in KOKKOS package so it compiles and runs

---
 src/KOKKOS/pair_snap_kokkos_impl.h | 18 +-----
 src/KOKKOS/sna_kokkos.h            |  8 +--
 src/KOKKOS/sna_kokkos_impl.h       | 97 ++++++++++++------------------
 src/SNAP/sna.cpp                   |  3 -
 4 files changed, 39 insertions(+), 87 deletions(-)

diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index bb2a5e9171..0ec4ed0995 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -85,9 +85,6 @@ void PairSNAPKokkos<DeviceType>::init_style()
   if (force->newton_pair == 0)
     error->all(FLERR,"Pair style SNAP requires newton pair on");
 
-  if (diagonalstyle != 3)
-    error->all(FLERR,"Must use diagonal style = 3 with pair snap/kk");
-
   // irequest = neigh request made by parent class
 
   neighflag = lmp->kokkos->neighflag;
@@ -343,23 +340,12 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
   Kokkos::deep_copy(d_coeffelem,h_coeffelem);
   Kokkos::deep_copy(d_map,h_map);
 
-  // deallocate non-kokkos sna
-
-  if (sna) {
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-    sna = NULL;
-  }
-
   // allocate memory for per OpenMP thread data which
   // is wrapped into the sna class
 
   snaKK = SNAKokkos<DeviceType>(rfac0,twojmax,
-                  diagonalstyle,use_shared_arrays,
                   rmin0,switchflag,bzeroflag);
-    //if (!use_shared_arrays)
-  snaKK.grow_rij(nmax);
+  snaKK.grow_rij(0);
   snaKK.init();
 }
 
@@ -667,8 +653,6 @@ double PairSNAPKokkos<DeviceType>::memory_usage()
   int n = atom->ntypes+1;
   bytes += n*n*sizeof(int);
   bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
   bytes += (2*ncoeffall)*sizeof(double);
   bytes += (ncoeff*3)*sizeof(double);
   bytes += snaKK.memory_usage();
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 7a80b262b7..40e5fe0ad4 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -48,7 +48,7 @@ inline
   SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
 
 inline
-  SNAKokkos(double, int, int, int, double, int, int);
+  SNAKokkos(double, int, double, int, int);
 
   KOKKOS_INLINE_FUNCTION
   ~SNAKokkos();
@@ -178,12 +178,6 @@ inline
                        double, double, double, // compute_duidrj
                        double, double, double, double, double);
 
-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
   // Sets the style for the switching function
   // 0 = none
   // 1 = cosine
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 0f2a450a3d..c43003af97 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -27,19 +27,17 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 template<class DeviceType>
 inline
 SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in,
          double rmin0_in, int switch_flag_in, int bzero_flag_in)
 {
   wself = 1.0;
 
-  use_shared_arrays = use_shared_arrays_in;
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
   switch_flag = switch_flag_in;
   bzero_flag = bzero_flag_in;
 
   twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;
 
   ncoeff = compute_ncoeff();
 
@@ -70,14 +68,12 @@ KOKKOS_INLINE_FUNCTION
 SNAKokkos<DeviceType>::SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
   wself = sna.wself;
 
-  use_shared_arrays = sna.use_shared_arrays;
   rfac0 = sna.rfac0;
   rmin0 = sna.rmin0;
   switch_flag = sna.switch_flag;
   bzero_flag = sna.bzero_flag;
 
   twojmax = sna.twojmax;
-  diagonalstyle = sna.diagonalstyle;
 
   ncoeff = sna.ncoeff;
   nmax = sna.nmax;
@@ -104,48 +100,45 @@ template<class DeviceType>
 inline
 void SNAKokkos<DeviceType>::build_indexlist()
 {
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-    int idxj_full_count = 0;
+  int idxj_count = 0;
+  int idxj_full_count = 0;
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) idxj_count++;
-          idxj_full_count++;
-        }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) idxj_count++;
+	idxj_full_count++;
+      }
 
-    // indexList can be changed here
+  // indexList can be changed here
 
-    idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
-    idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
-    auto h_idxj = Kokkos::create_mirror_view(idxj);
-    auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
+  idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
+  idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
+  auto h_idxj = Kokkos::create_mirror_view(idxj);
+  auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
 
-    idxj_max = idxj_count;
-    idxj_full_max = idxj_full_count;
+  idxj_max = idxj_count;
+  idxj_full_max = idxj_full_count;
 
-    idxj_count = 0;
-    idxj_full_count = 0;
+  idxj_count = 0;
+  idxj_full_count = 0;
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) {
-            h_idxj[idxj_count].j1 = j1;
-            h_idxj[idxj_count].j2 = j2;
-            h_idxj[idxj_count].j = j;
-            idxj_count++;
-          }
-          h_idxj_full[idxj_full_count].j1 = j1;
-          h_idxj_full[idxj_full_count].j2 = j2;
-          h_idxj_full[idxj_full_count].j = j;
-          idxj_full_count++;
-        }
-    Kokkos::deep_copy(idxj,h_idxj);
-    Kokkos::deep_copy(idxj_full,h_idxj_full);
-
-  }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) {
+	  h_idxj[idxj_count].j1 = j1;
+	  h_idxj[idxj_count].j2 = j2;
+	  h_idxj[idxj_count].j = j;
+	  idxj_count++;
+	}
+	h_idxj_full[idxj_full_count].j1 = j1;
+	h_idxj_full[idxj_full_count].j2 = j2;
+	h_idxj_full[idxj_full_count].j = j;
+	idxj_full_count++;
+      }
+  Kokkos::deep_copy(idxj,h_idxj);
+  Kokkos::deep_copy(idxj_full,h_idxj_full);
 
 }
 /* ---------------------------------------------------------------------- */
@@ -1223,26 +1216,10 @@ int SNAKokkos<DeviceType>::compute_ncoeff()
   ncount = 0;
 
   for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
+    for (int j2 = 0; j2 <= j1; j2++)
       for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
-
+	   j <= MIN(twojmax, j1 + j2); j += 2)
+	if (j >= j1) ncount++;
   return ncount;
 }
 
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index 75601b8e17..fd25d35677 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -1208,9 +1208,6 @@ double SNA::memory_usage()
   bytes += nmax * sizeof(double);                        // wj
   bytes += nmax * sizeof(double);                        // rcutij
 
-  printf("SNAP Z list Memory Usage %d\n",idxz_max * sizeof(double) * 2);
-  printf("SNAP CG list Memory Usage %d\n",idxcg_max * sizeof(double));
-
   return bytes;
 }
 /* ---------------------------------------------------------------------- */

From 67a1a63f5fec8a68f460e07d0c4768838cf83ce2 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 13 Jun 2019 10:10:37 -0600
Subject: [PATCH 19/21] Removed old text

---
 doc/src/compute_sna_atom.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/src/compute_sna_atom.txt b/doc/src/compute_sna_atom.txt
index 10e68f5698..9dca6b1c6f 100644
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@@ -25,7 +25,6 @@ R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
 w_1, w_2,... = list of neighbor weights, one for each type  :l
 zero or more keyword/value pairs may be appended :l
 keyword = {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
-//     {3} = subset satisfying j2 <= j1 <= j
   {rmin0} value = parameter in distance to angle conversion (distance units)
   {switchflag} value = {0} or {1}
      {0} = do not use switching function

From 5fb505ca8cbb7f0a8be96ae400eff1a0802eff98 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 13 Jun 2019 10:24:18 -0600
Subject: [PATCH 20/21] Fixed typo

---
 doc/src/compute_sna_atom.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/compute_sna_atom.txt b/doc/src/compute_sna_atom.txt
index 9dca6b1c6f..518d28aec9 100644
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@@ -200,7 +200,7 @@ for the number of bispectrum components was removed in 2019,
 since all potentials use the value of 3, corresponding to the
 above set of bispectrum components.
 
-ompute {snad/atom} evaluates a per-atom array. The columns are
+Compute {snad/atom} evaluates a per-atom array. The columns are
 arranged into {ntypes} blocks, listed in order of atom type {I}.  Each
 block contains three sub-blocks corresponding to the {x}, {y}, and {z}
 components of the atom position.  Each of these sub-blocks contains

From 4d7d3a5d530c7b847dd08ca62aa7d091a94e6139 Mon Sep 17 00:00:00 2001
From: Aidan Thompson <athomps@sandia.gov>
Date: Thu, 13 Jun 2019 15:56:18 -0600
Subject: [PATCH 21/21] Switched algorithm for compute_yi to one based on zlist
 ordering

---
 src/SNAP/sna.cpp | 155 +++++++++++++++++++++++------------------------
 1 file changed, 76 insertions(+), 79 deletions(-)

diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index fd25d35677..ec545c51b2 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -398,13 +398,18 @@ void SNA::compute_zi()
 }
 
 /* ----------------------------------------------------------------------
-   compute Yi from Ui without storing Zi, looping over ylist
+   compute Yi from Ui without storing Zi, looping over zlist indices
 ------------------------------------------------------------------------- */
 
 void SNA::compute_yi(const double* beta)
 {
+  int j;
+  int jjz;
+  int jju;
+  double betaj;
+
   for(int j = 0; j <= twojmax; j++) {
-    int jju = idxu_block[j];
+    jju = idxu_block[j];
     for(int mb = 0; 2*mb <= j; mb++)
       for(int ma = 0; ma <= j; ma++) {
         ylist_r[jju] = 0.0;
@@ -413,93 +418,85 @@ void SNA::compute_yi(const double* beta)
       } // end loop over ma, mb
   } // end loop over j
 
-  for(int jjb = 0; jjb < idxb_max; jjb++) {
-    const int j1b = idxb[jjb].j1;
-    const int j2b = idxb[jjb].j2;
-    const int j3b = idxb[jjb].j;
+  int ma2, mb2;
+  for(int jjz = 0; jjz < idxz_max; jjz++) {
+    const int j1 = idxz[jjz].j1;
+    const int j2 = idxz[jjz].j2;
+    const int j = idxz[jjz].j;
+    const int ma1min = idxz[jjz].ma1min;
+    const int ma2max = idxz[jjz].ma2max;
+    const int na = idxz[jjz].na;
+    const int mb1min = idxz[jjz].mb1min;
+    const int mb2max = idxz[jjz].mb2max;
+    const int nb = idxz[jjz].nb;
 
-    compute_yterm(j1b,j2b,j3b,beta);
-    compute_yterm(j3b,j2b,j1b,beta);
-    compute_yterm(j3b,j1b,j2b,beta);
+    const double* cgblock = cglist + idxcg_block[j1][j2][j];
+    int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
+    int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
 
-  } // end loop over jjb
+    double ztmp_r = 0.0;
+    double ztmp_i = 0.0;
 
-}
+    int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+    int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+    int icgb = mb1min*(j2+1) + mb2max;
+    for(int ib = 0; ib < nb; ib++) {
 
-void SNA::compute_yterm(int j1, int j2, int j, const double* beta) {
-  double betaj;
+      double suma1_r = 0.0;
+      double suma1_i = 0.0;
 
-  int jju = idxu_block[j];
-  int jjz = idxz_block[j1][j2][j];
+      const double* u1_r = &ulisttot_r[jju1];
+      const double* u1_i = &ulisttot_i[jju1];
+      const double* u2_r = &ulisttot_r[jju2];
+      const double* u2_i = &ulisttot_i[jju2];
+
+      int ma1 = ma1min;
+      int ma2 = ma2max;
+      int icga = ma1min*(j2+1) + ma2max;
+
+      for(int ia = 0; ia < na; ia++) {
+        suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
+        suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
+        ma1++;
+        ma2--;
+        icga += j2;
+      } // end loop over ia
+
+      ztmp_r += cgblock[icgb] * suma1_r;
+      ztmp_i += cgblock[icgb] * suma1_i;
+      jju1 += j1+1;
+      jju2 -= j2+1;
+      icgb += j2;
+    } // end loop over ib
+
+    // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+    // find right y_list[jju] and beta[jjb] entries
+    // multiply and divide by j+1 factors
+    // account for multiplicity of 1, 2, or 3
+
+    const int jju = idxz[jjz].jju;
 
   // pick out right beta value
 
-  if (j >= j1) {
-    const int jjb = idxb_block[j1][j2][j];
-    betaj = beta[jjb];
-  } else if (j >= j2) {
-    const int jjb = idxb_block[j][j2][j1];
-    betaj = beta[jjb]*(j1+1)/(j+1.0);
-  } else {
-    const int jjb = idxb_block[j2][j][j1];
-    betaj = beta[jjb]*(j1+1)/(j+1.0);
-  }
+    if (j >= j1) {
+      const int jjb = idxb_block[j1][j2][j];
+      if (j1 == j) {
+        if (j2 == j) betaj = 3*beta[jjb];
+        else betaj = 2*beta[jjb];
+      } else betaj = beta[jjb]; 
+    } else if (j >= j2) {
+      const int jjb = idxb_block[j][j2][j1];
+      if (j2 == j) betaj = 2*beta[jjb]*(j1+1)/(j+1.0);
+      else betaj = beta[jjb]*(j1+1)/(j+1.0);
+    } else {
+      const int jjb = idxb_block[j2][j][j1];
+      betaj = beta[jjb]*(j1+1)/(j+1.0);
+    }
 
-  // can replace this with a single loop over jjz
+    ylist_r[jju] += betaj*ztmp_r;
+    ylist_i[jju] += betaj*ztmp_i;
 
-  for (int mb = 0; 2*mb <= j; mb++)
-    for (int ma = 0; ma <= j; ma++) {
-
-      const int ma1min = idxz[jjz].ma1min;
-      const int ma2max = idxz[jjz].ma2max;
-      const int na = idxz[jjz].na;
-      const int mb1min = idxz[jjz].mb1min;
-      const int mb2max = idxz[jjz].mb2max;
-      const int nb = idxz[jjz].nb;
-      
-      const double* cgblock = cglist + idxcg_block[j1][j2][j];
-
-      double ztmp_r = 0.0;
-      double ztmp_i = 0.0;
-
-      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
-      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
-      int icgb = mb1min*(j2+1) + mb2max;
-      for(int ib = 0; ib < nb; ib++) {
-
-        double suma1_r = 0.0;
-        double suma1_i = 0.0;
-
-        const double* u1_r = &ulisttot_r[jju1];
-        const double* u1_i = &ulisttot_i[jju1];
-        const double* u2_r = &ulisttot_r[jju2];
-        const double* u2_i = &ulisttot_i[jju2];
-
-        int ma1 = ma1min;
-        int ma2 = ma2max;
-        int icga = ma1min*(j2+1) + ma2max;
-          
-        for(int ia = 0; ia < na; ia++) {
-          suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
-          suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
-          ma1++;
-          ma2--;
-          icga += j2;
-        } // end loop over ia
-
-        ztmp_r += cgblock[icgb] * suma1_r;
-        ztmp_i += cgblock[icgb] * suma1_i;
-        jju1 += j1+1;
-        jju2 -= j2+1;
-        icgb += j2;
-      } // end loop over ib
-
-      // printf("jju betaj ztmp ylist %d %g %g %d %d %d %d %d\n",jju,betaj,ztmp_r,j1,j2,j,ma,mb);
-      ylist_r[jju] += betaj*ztmp_r;
-      ylist_i[jju] += betaj*ztmp_i;
-      jjz++;
-      jju++;
-    } // end loop over ma, mb
+  } // end loop over jjz
 }
 
 /* ----------------------------------------------------------------------