Added compact arrays, removed unused openmp stuff

2019-06-03 19:50:40 -06:00 · 2019-06-03 19:50:40 -06:00 · 960a975e2a
parent 803e0631c5
commit 960a975e2a
11 changed files with 332 additions and 1614 deletions
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -115,20 +114,10 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute sna/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  size_peratom_cols = ncoeff;
  if (quadraticflag) size_peratom_cols += (ncoeff*(ncoeff+1))/2;
  peratom_flag = 1;
@ -136,7 +125,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
  nmax = 0;
  njmax = 0;
  sna = NULL;
-
 }

 /* ---------------------------------------------------------------------- */
@ -147,9 +135,7 @@ ComputeSNAAtom::~ComputeSNAAtom()
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -176,13 +162,7 @@ void ComputeSNAAtom::init()
    if (strcmp(modify->compute[i]->style,"sna/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute sna/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -223,11 +203,7 @@ void ComputeSNAAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -241,7 +217,7 @@ void ComputeSNAAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -258,26 +234,26 @@ void ComputeSNAAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype] && rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
-      snaptr[tid]->compute_bi();
-      snaptr[tid]->copy_bi2bvec();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
+      snaptr->compute_bi();
+      snaptr->copy_bi2bvec();
      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-        sna[i][icoeff] = snaptr[tid]->bvec[icoeff];
+        sna[i][icoeff] = snaptr->bvec[icoeff];
      if (quadraticflag) {
        int ncount = ncoeff;
        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bi = snaptr[tid]->bvec[icoeff];
+          double bi = snaptr->bvec[icoeff];

          // diagonal element of quadratic matrix

@ -286,7 +262,7 @@ void ComputeSNAAtom::compute_peratom()
          // upper-triangular elements of quadratic matrix

          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++)
-            sna[i][ncount++] = bi*snaptr[tid]->bvec[jcoeff];
+            sna[i][ncount++] = bi*snaptr->bvec[jcoeff];
        }
      }
    } else {
@ -305,7 +281,6 @@ double ComputeSNAAtom::memory_usage()
  double bytes = nmax*size_peratom_cols * sizeof(double);
  bytes += 3*njmax*sizeof(double);
  bytes += njmax*sizeof(int);
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
  return bytes;
 }

--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@ -42,10 +42,9 @@ class ComputeSNAAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  double cutmax;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -113,20 +112,10 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute snad/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  nperdim = ncoeff;
  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
  yoffset = nperdim;
@ -138,7 +127,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
  nmax = 0;
  njmax = 0;
  snad = NULL;
-
 }

 /* ---------------------------------------------------------------------- */
@ -149,9 +137,7 @@ ComputeSNADAtom::~ComputeSNADAtom()
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -178,13 +164,7 @@ void ComputeSNADAtom::init()
    if (strcmp(modify->compute[i]->style,"snad/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute snad/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -235,11 +215,7 @@ void ComputeSNADAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -258,7 +234,7 @@ void ComputeSNADAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -276,30 +252,30 @@ void ComputeSNADAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
      if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
+        snaptr->copy_bi2bvec();
      }

      for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        const int j = snaptr->inside[jj];
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
+        snaptr->copy_dbi2dbvec();

        // Accumulate -dBi/dRi, -dBi/dRj

@ -307,12 +283,12 @@ void ComputeSNADAtom::compute_peratom()
        double *snadj = snad[j]+typeoffset;

        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snadi[icoeff] += snaptr[tid]->dbvec[icoeff][0];
-          snadi[icoeff+yoffset] += snaptr[tid]->dbvec[icoeff][1];
-          snadi[icoeff+zoffset] += snaptr[tid]->dbvec[icoeff][2];
-          snadj[icoeff] -= snaptr[tid]->dbvec[icoeff][0];
-          snadj[icoeff+yoffset] -= snaptr[tid]->dbvec[icoeff][1];
-          snadj[icoeff+zoffset] -= snaptr[tid]->dbvec[icoeff][2];
+          snadi[icoeff] += snaptr->dbvec[icoeff][0];
+          snadi[icoeff+yoffset] += snaptr->dbvec[icoeff][1];
+          snadi[icoeff+zoffset] += snaptr->dbvec[icoeff][2];
+          snadj[icoeff] -= snaptr->dbvec[icoeff][0];
+          snadj[icoeff+yoffset] -= snaptr->dbvec[icoeff][1];
+          snadj[icoeff+zoffset] -= snaptr->dbvec[icoeff][2];
        }

        if (quadraticflag) {
@ -321,10 +297,10 @@ void ComputeSNADAtom::compute_peratom()
          snadj += quadraticoffset;
          int ncount = 0;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->bvec[icoeff];
+            double bix = snaptr->dbvec[icoeff][0];
+            double biy = snaptr->dbvec[icoeff][1];
+            double biz = snaptr->dbvec[icoeff][2];

            // diagonal elements of quadratic matrix

@ -343,12 +319,12 @@ void ComputeSNADAtom::compute_peratom()
            // upper-triangular elements of quadratic matrix

            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
+                + bix*snaptr->bvec[jcoeff];
+              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
+                + biy*snaptr->bvec[jcoeff];
+              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
+                + biz*snaptr->bvec[jcoeff];

              snadi[ncount] +=         dbxtmp;
              snadi[ncount+yoffset] += dbytmp;
@ -408,6 +384,5 @@ double ComputeSNADAtom::memory_usage()
  bytes += 3*njmax*sizeof(double);
  bytes += njmax*sizeof(int);
  bytes += 3*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
  return bytes;
 }
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@ -44,10 +44,9 @@ class ComputeSNADAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  double cutmax;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -109,20 +108,10 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute snav/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,diagonalstyle,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  nperdim = ncoeff;
  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
  size_peratom_cols = 6*nperdim*atom->ntypes;
@ -144,9 +133,7 @@ ComputeSNAVAtom::~ComputeSNAVAtom()
  memory->destroy(wjelem);
  memory->destroy(cutsq);

-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -174,13 +161,7 @@ void ComputeSNAVAtom::init()
    if (strcmp(modify->compute[i]->style,"snav/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute snav/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -230,11 +211,7 @@ void ComputeSNAVAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -251,7 +228,7 @@ void ComputeSNAVAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-          snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -269,31 +246,31 @@ void ComputeSNAVAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
      if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
+        snaptr->copy_bi2bvec();
      }

      for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
+        const int j = snaptr->inside[jj];

-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
+        snaptr->copy_dbi2dbvec();

        // Accumulate -dBi/dRi*Ri, -dBi/dRj*Rj

@ -301,18 +278,18 @@ void ComputeSNAVAtom::compute_peratom()
        double *snavj = snav[j]+typeoffset;

        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snavi[icoeff]           += snaptr[tid]->dbvec[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr[tid]->dbvec[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr[tid]->dbvec[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ytmp;
-          snavj[icoeff]           -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
+          snavi[icoeff]           += snaptr->dbvec[icoeff][0]*xtmp;
+          snavi[icoeff+nperdim]   += snaptr->dbvec[icoeff][1]*ytmp;
+          snavi[icoeff+2*nperdim] += snaptr->dbvec[icoeff][2]*ztmp;
+          snavi[icoeff+3*nperdim] += snaptr->dbvec[icoeff][1]*ztmp;
+          snavi[icoeff+4*nperdim] += snaptr->dbvec[icoeff][0]*ztmp;
+          snavi[icoeff+5*nperdim] += snaptr->dbvec[icoeff][0]*ytmp;
+          snavj[icoeff]           -= snaptr->dbvec[icoeff][0]*x[j][0];
+          snavj[icoeff+nperdim]   -= snaptr->dbvec[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nperdim] -= snaptr->dbvec[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nperdim] -= snaptr->dbvec[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nperdim] -= snaptr->dbvec[icoeff][0]*x[j][1];
        }

        if (quadraticflag) {
@ -321,10 +298,10 @@ void ComputeSNAVAtom::compute_peratom()
          snavj += quadraticoffset;
          int ncount = 0;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->bvec[icoeff];
+            double bix = snaptr->dbvec[icoeff][0];
+            double biy = snaptr->dbvec[icoeff][1];
+            double biz = snaptr->dbvec[icoeff][2];

            // diagonal element of quadratic matrix

@ -348,12 +325,12 @@ void ComputeSNAVAtom::compute_peratom()
            // upper-triangular elements of quadratic matrix

            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dbvec[jcoeff][0]
+                + bix*snaptr->bvec[jcoeff];
+              double dbytmp = bi*snaptr->dbvec[jcoeff][1]
+                + biy*snaptr->bvec[jcoeff];
+              double dbztmp = bi*snaptr->dbvec[jcoeff][2]
+                + biz*snaptr->bvec[jcoeff];
              snavi[ncount] +=           dbxtmp*xtmp;
              snavi[ncount+nperdim] +=   dbytmp*ytmp;
              snavi[ncount+2*nperdim] += dbztmp*ztmp;
@ -419,6 +396,5 @@ double ComputeSNAVAtom::memory_usage()
  bytes += njmax*sizeof(int);
  bytes += 6*nperdim*atom->ntypes;
  if (quadraticflag) bytes += 6*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
  return bytes;
 }
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@ -44,9 +44,8 @@ class ComputeSNAVAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/openmp_snap.h
+++ b/src/SNAP/openmp_snap.h
@ -1,16 +0,0 @@
-
-#ifndef LMP_OPENMP_SNAP_H
-#define LMP_OPENMP_SNAP_H
-
-#if defined(_OPENMP)
-#include <omp.h>
-#else
-enum omp_sched_t {omp_sched_static, omp_sched_dynamic, omp_sched_guided, omp_sched_auto};
-inline int omp_get_thread_num() { return 0;}
-inline int omp_set_num_threads(int num_threads) {return 1;}
-/* inline int __sync_fetch_and_add(int* ptr, int value) {int tmp = *ptr; ptr[0]+=value; return tmp;} */
-inline void omp_set_schedule(omp_sched_t schedule,int modifier=1) {}
-inline int omp_in_parallel() {return 0;}
-#endif
-
-#endif
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@ -23,7 +23,6 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "sna.h"
-#include "openmp_snap.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
@ -55,51 +54,6 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
  coeffelem = NULL;

  nmax = 0;
-  nthreads = 1;
-
-  schedule_user = 0;
-  schedule_time_guided = -1;
-  schedule_time_dynamic = -1;
-  ncalls_neigh =-1;
-
-  ilistmask_max = 0;
-  ilistmask = NULL;
-  ghostinum = 0;
-  ghostilist_max = 0;
-  ghostilist = NULL;
-  ghostnumneigh_max = 0;
-  ghostnumneigh = NULL;
-  ghostneighs = NULL;
-  ghostfirstneigh = NULL;
-  ghostneighs_total = 0;
-  ghostneighs_max = 0;
-
-  i_max = 0;
-  i_neighmax = 0;
-  i_numpairs = 0;
-  i_rij = NULL;
-  i_inside = NULL;
-  i_wj = NULL;
-  i_rcutij = NULL;
-  i_ninside = NULL;
-  i_pairs = NULL;
-  i_uarraytot_r = NULL;
-  i_uarraytot_i = NULL;
-  i_zarray_r = NULL;
-  i_zarray_i = NULL;
-
-  use_shared_arrays = 0;
-
-#ifdef TIMING_INFO
-  timers[0] = 0;
-  timers[1] = 0;
-  timers[2] = 0;
-  timers[3] = 0;
-#endif
-
-  // Need to set this because restart not handled by PairHybrid
-
-  sna = NULL;

  beta_max = 0;
  beta = NULL;
@ -123,35 +77,7 @@ PairSNAP::~PairSNAP()
  memory->destroy(beta);
  memory->destroy(bispectrum);

-  // Need to set this because restart not handled by PairHybrid
-
-  if (sna) {
-
-#ifdef TIMING_INFO
-    double time[5];
-    double timeave[5];
-    double timeave_mpi[5];
-    double timemax_mpi[5];
-
-    for (int i = 0; i < 5; i++) {
-      time[i] = 0;
-      timeave[i] = 0;
-      for (int tid = 0; tid<nthreads; tid++) {
-        if (sna[tid]->timers[i]>time[i])
-          time[i] = sna[tid]->timers[i];
-        timeave[i] += sna[tid]->timers[i];
-      }
-      timeave[i] /= nthreads;
-    }
-    MPI_Reduce(timeave, timeave_mpi, 5, MPI_DOUBLE, MPI_SUM, 0, world);
-    MPI_Reduce(time, timemax_mpi, 5, MPI_DOUBLE, MPI_MAX, 0, world);
-#endif
-
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-
-  }
+  delete snaptr;

  if (allocated) {
    memory->destroy(setflag);
@ -161,22 +87,11 @@ PairSNAP::~PairSNAP()

 }

-void PairSNAP::compute(int eflag, int vflag)
-{
-//   if (use_optimized)
-//     compute_optimized(eflag, vflag);
-//   else
-
-// hard-code compute_regular()
- 
-    compute_regular(eflag, vflag);
-}
-
 /* ----------------------------------------------------------------------
   This version is a straightforward implementation
   ---------------------------------------------------------------------- */

-void PairSNAP::compute_regular(int eflag, int vflag)
+void PairSNAP::compute(int eflag, int vflag)
 {
  int i,j,jnum,ninside;
  double delx,dely,delz,evdwl,rsq;
@ -191,7 +106,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
  int *type = atom->type;
  int nlocal = atom->nlocal;
  int newton_pair = force->newton_pair;
-  class SNA* snaptr = sna[0];

  if (beta_max < list->inum) {
    memory->grow(beta,list->inum,ncoeff,"PairSNAP:beta");
@ -315,573 +229,6 @@ void PairSNAP::compute_regular(int eflag, int vflag)
  if (vflag_fdotr) virial_fdotr_compute();
 }

-inline int PairSNAP::equal(double* x,double* y)
-{
-  double dist2 =
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-  if (dist2 < 1e-20) return 1;
-  return 0;
-}
-
-inline double PairSNAP::dist2(double* x,double* y)
-{
-  return
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-}
-
-// return extra communication cutoff
-// extra_cutoff = max(subdomain_length)
-
-double PairSNAP::extra_cutoff()
-{
-  double sublo[3],subhi[3];
-
-  if (domain->triclinic == 0) {
-    for (int dim = 0 ; dim < 3 ; dim++) {
-      sublo[dim] = domain->sublo[dim];
-      subhi[dim] = domain->subhi[dim];
-    }
-  } else {
-    domain->lamda2x(domain->sublo_lamda,sublo);
-    domain->lamda2x(domain->subhi_lamda,subhi);
-  }
-
-  double sub_size[3];
-  for (int dim = 0; dim < 3; dim++)
-    sub_size[dim] = subhi[dim] - sublo[dim];
-
-  double max_sub_size = 0;
-  for (int dim = 0; dim < 3; dim++)
-    max_sub_size = MAX(max_sub_size,sub_size[dim]);
-
-  // note: for triclinic, probably need something different
-  // see Comm::setup()
-
-  return max_sub_size;
-}
-
-// micro load_balancer: each MPI process will
-// check with each of its 26 neighbors,
-// whether an imbalance exists in the number
-// of atoms to calculate forces for.
-// If it does it will set ilistmask of one of
-// its local atoms to zero, and send its Tag
-// to the neighbor process. The neighboring process
-// will check its ghost list for the
-// ghost atom with the same Tag which is closest
-// to its domain center, and build a
-// neighborlist for this ghost atom. For this to work,
-// the communication cutoff has to be
-// as large as the neighbor cutoff +
-// maximum subdomain length.
-
-// Note that at most one atom is exchanged per processor pair.
-
-// Also note that the local atom assignment
-// doesn't change. This load balancer will cause
-// some ghost atoms to have full neighborlists
-// which are unique to PairSNAP.
-// They are not part of the generally accessible neighborlist.
-// At the same time corresponding local atoms on
-// other MPI processes will not be
-// included in the force computation since
-// their ilistmask is 0. This does not effect
-// any other classes which might
-// access the same general neighborlist.
-// Reverse communication (newton on) of forces is required.
-
-// Currently the load balancer does two passes,
-// since its exchanging atoms upstream and downstream.
-
-void PairSNAP::load_balance()
-{
-  double sublo[3],subhi[3];
-  if (domain->triclinic == 0) {
-    double* sublotmp = domain->sublo;
-    double* subhitmp = domain->subhi;
-    for (int dim = 0 ; dim<3 ; dim++) {
-      sublo[dim]=sublotmp[dim];
-      subhi[dim]=subhitmp[dim];
-    }
-  } else {
-    double* sublotmp = domain->sublo_lamda;
-    double* subhitmp = domain->subhi_lamda;
-    domain->lamda2x(sublotmp,sublo);
-    domain->lamda2x(subhitmp,subhi);
-  }
-
-  //if (list->inum==0) list->grow(atom->nmax);
-
-  int nlocal = ghostinum;
-  for (int i=0; i < list->inum; i++)
-    if (ilistmask[i]) nlocal++;
-  int ***grid2proc = comm->grid2proc;
-  int* procgrid = comm->procgrid;
-
-  int nlocal_up,nlocal_down;
-  MPI_Request request;
-
-  double sub_mid[3];
-  for (int dim=0; dim<3; dim++)
-    sub_mid[dim] = (subhi[dim] + sublo[dim])/2;
-
-  if (comm->cutghostuser <
-      neighbor->cutneighmax+extra_cutoff())
-    error->all(FLERR,"Communication cutoff too small for SNAP micro load balancing");
-
-  int nrecv = ghostinum;
-  int totalsend = 0;
-  int nsend = 0;
-  int depth = 1;
-
-  for (int dx = -depth; dx < depth+1; dx++)
-    for (int dy = -depth; dy < depth+1; dy++)
-      for (int dz = -depth; dz < depth+1; dz++) {
-
-        if (dx == dy && dy == dz && dz == 0) continue;
-
-        int sendloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        sendloc[0] += dx;
-        sendloc[1] += dy;
-        sendloc[2] += dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] >= procgrid[dim])
-            sendloc[dim] = sendloc[dim] - procgrid[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] < 0)
-            sendloc[dim] = procgrid[dim] + sendloc[dim];
-        int recvloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        recvloc[0] -= dx;
-        recvloc[1] -= dy;
-        recvloc[2] -= dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] < 0)
-            recvloc[dim] = procgrid[dim] + recvloc[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] >= procgrid[dim])
-            recvloc[dim] = recvloc[dim] - procgrid[dim];
-
-        int sendproc = grid2proc[sendloc[0]][sendloc[1]][sendloc[2]];
-        int recvproc = grid2proc[recvloc[0]][recvloc[1]][recvloc[2]];
-
-        // two stage process, first upstream movement, then downstream
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-        nsend = 0;
-
-        // send upstream
-
-        if (nlocal > nlocal_up+1) {
-
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i] == 0)
-            i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,recvproc,0,world,&request);
-          else {
-            int j = -1;
-            MPI_Isend(&j,1,MPI_INT,recvproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j = 0; j < list->inum; j++)
-              if (list->ilist[j] == i)
-                ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // recv downstream
-
-        if (nlocal < nlocal_down-1) {
-          nlocal++;
-          int get_tag = -1;
-          MPI_Recv(&get_tag,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-          // if get_tag -1 the other process didnt have local atoms to send
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal+atom->nghost+100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid, atom->x[j]) < mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid, atom->x[j]);
-                }
-
-            // build neighborlist for this particular
-            // ghost atom, and add it to list->ilist
-
-            if (ghostneighs_max - ghostneighs_total <
-                neighbor->oneatom) {
-              memory->grow(ghostneighs,
-                           ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            // find all neighbors by looping
-            // over all local and ghost atoms
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-
-        // decrease nlocal later, so that it is the
-        // initial number both for receiving and sending
-
-        if (nsend) nlocal--;
-
-        // second pass through the grid
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-        // send downstream
-
-        nsend=0;
-        if (nlocal > nlocal_down+1) {
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i]==0) i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,sendproc,0,world,&request);
-          else {
-            int j =- 1;
-            MPI_Isend(&j,1,MPI_INT,sendproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j=0; j<list->inum; j++)
-              if (list->ilist[j] == i) ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // receive upstream
-
-        if (nlocal < nlocal_up-1) {
-          nlocal++;
-          int get_tag = -1;
-
-          MPI_Recv(&get_tag,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal + atom->nghost + 100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid,atom->x[j])<mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid,atom->x[j]);
-                }
-
-            // build neighborlist for this particular ghost atom
-
-            if (ghostneighs_max-ghostneighs_total < neighbor->oneatom) {
-              memory->grow(ghostneighs,ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-        if (nsend) nlocal--;
-      }
-}
-
-void PairSNAP::set_sna_to_shared(int snaid,int i)
-{
-  sna[snaid]->rij = i_rij[i];
-  sna[snaid]->inside = i_inside[i];
-  sna[snaid]->wj = i_wj[i];
-  sna[snaid]->rcutij = i_rcutij[i];
-  sna[snaid]->zarray_r = i_zarray_r[i];
-  sna[snaid]->zarray_i = i_zarray_i[i];
-  sna[snaid]->uarraytot_r = i_uarraytot_r[i];
-  sna[snaid]->uarraytot_i = i_uarraytot_i[i];
-}
-
-void PairSNAP::build_per_atom_arrays()
-{
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-  int count = 0;
-  int neighmax = 0;
-  for (int ii = 0; ii < list->inum; ii++)
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      neighmax=MAX(neighmax,list->numneigh[list->ilist[ii]]);
-      ++count;
-    }
-  for (int ii = 0; ii < ghostinum; ii++) {
-    neighmax=MAX(neighmax,ghostnumneigh[ghostilist[ii]]);
-    ++count;
-  }
-
-  if (i_max < count || i_neighmax < neighmax) {
-    int i_maxt = MAX(count,i_max);
-    i_neighmax = MAX(neighmax,i_neighmax);
-    memory->destroy(i_rij);
-    memory->destroy(i_inside);
-    memory->destroy(i_wj);
-    memory->destroy(i_rcutij);
-    memory->destroy(i_ninside);
-    memory->destroy(i_pairs);
-    memory->create(i_rij,i_maxt,i_neighmax,3,"PairSNAP::i_rij");
-    memory->create(i_inside,i_maxt,i_neighmax,"PairSNAP::i_inside");
-    memory->create(i_wj,i_maxt,i_neighmax,"PairSNAP::i_wj");
-    memory->create(i_rcutij,i_maxt,i_neighmax,"PairSNAP::i_rcutij");
-    memory->create(i_ninside,i_maxt,"PairSNAP::i_ninside");
-    memory->create(i_pairs,i_maxt*i_neighmax,4,"PairSNAP::i_pairs");
-  }
-
-  if (i_max < count) {
-    int jdim = sna[0]->twojmax+1;
-    memory->destroy(i_uarraytot_r);
-    memory->destroy(i_uarraytot_i);
-    memory->create(i_uarraytot_r,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_r");
-    memory->create(i_uarraytot_i,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_i");
-    if (i_zarray_r != NULL)
-      for (int i = 0; i < i_max; i++) {
-        memory->destroy(i_zarray_r[i]);
-        memory->destroy(i_zarray_i[i]);
-      }
-
-    delete [] i_zarray_r;
-    delete [] i_zarray_i;
-    i_zarray_r = new double*****[count];
-    i_zarray_i = new double*****[count];
-    for (int i = 0; i < count; i++) {
-      memory->create(i_zarray_r[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_r");
-      memory->create(i_zarray_i[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_i");
-    }
-  }
-
-  if (i_max < count)
-    i_max = count;
-
-  count = 0;
-  i_numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      int* jlist = list->firstneigh[i];
-      const double xtmp = atom->x[i][0];
-      const double ytmp = atom->x[i][1];
-      const double ztmp = atom->x[i][2];
-      const int itype = atom->type[i];
-      const int ielem = map[itype];
-      const double radi = radelem[ielem];
-      int ninside = 0;
-      for (int jj = 0; jj < jnum; jj++) {
-        int j = jlist[jj];
-        j &= NEIGHMASK;
-        const double delx = atom->x[j][0] - xtmp;
-        const double dely = atom->x[j][1] - ytmp;
-        const double delz = atom->x[j][2] - ztmp;
-        const double rsq = delx*delx + dely*dely + delz*delz;
-        int jtype = atom->type[j];
-        int jelem = map[jtype];
-
-        i_pairs[i_numpairs][0] = i;
-        i_pairs[i_numpairs][1] = jj;
-        i_pairs[i_numpairs][2] = -1;
-        i_pairs[i_numpairs][3] = count;
-        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          i_rij[count][ninside][0] = delx;
-          i_rij[count][ninside][1] = dely;
-          i_rij[count][ninside][2] = delz;
-          i_inside[count][ninside] = j;
-          i_wj[count][ninside] = wjelem[jelem];
-          i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-
-          // update index list with inside index
-          i_pairs[i_numpairs][2] = ninside++;
-        }
-        i_numpairs++;
-      }
-      i_ninside[count] = ninside;
-      count++;
-    }
-  }
-
-  for (int ii = 0; ii < ghostinum; ii++) {
-    int i = ghostilist[ii];
-    int jnum = ghostnumneigh[i];
-    int* jlist = ghostneighs+ghostfirstneigh[i];
-    const double xtmp = atom->x[i][0];
-    const double ytmp = atom->x[i][1];
-    const double ztmp = atom->x[i][2];
-    const int itype = atom->type[i];
-    const int ielem = map[itype];
-    const double radi = radelem[ielem];
-    int ninside = 0;
-
-    for (int jj = 0; jj < jnum; jj++) {
-      int j = jlist[jj];
-      j &= NEIGHMASK;
-      const double delx = atom->x[j][0] - xtmp;
-      const double dely = atom->x[j][1] - ytmp;
-      const double delz = atom->x[j][2] - ztmp;
-      const double rsq = delx*delx + dely*dely + delz*delz;
-      int jtype = atom->type[j];
-      int jelem = map[jtype];
-
-      i_pairs[i_numpairs][0] = i;
-      i_pairs[i_numpairs][1] = jj;
-      i_pairs[i_numpairs][2] = -1;
-      i_pairs[i_numpairs][3] = count;
-      if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-        i_rij[count][ninside][0] = delx;
-        i_rij[count][ninside][1] = dely;
-        i_rij[count][ninside][2] = delz;
-        i_inside[count][ninside] = j;
-        i_wj[count][ninside] = wjelem[jelem];
-        i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-        // update index list with inside index
-        i_pairs[i_numpairs][2] = ninside++;
-      }
-      i_numpairs++;
-    }
-    i_ninside[count] = ninside;
-    count++;
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-#if defined(_OPENMP)
-#pragma omp parallel for shared(count) default(none)
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    //sna[tid]->compute_ui(i_ninside[ii]);
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-    sna[tid]->compute_ui_omp(i_ninside[ii],MAX(int(nthreads/count),1));
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&endtime);
-    sna[tid]->timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                          (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-  }
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = 0;//omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    sna[tid]->compute_zi_omp(MAX(int(nthreads/count),1));
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  sna[0]->timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                      (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-}
-
 /* ----------------------------------------------------------------------
   compute beta
 ------------------------------------------------------------------------- */
@ -914,7 +261,6 @@ void PairSNAP::compute_bispectrum()

  double **x = atom->x;
  int *type = atom->type;
-  class SNA* snaptr = sna[0];

  for (int ii = 0; ii < list->inum; ii++) {
    i = list->ilist[ii];
@ -991,96 +337,8 @@ void PairSNAP::allocate()

 void PairSNAP::settings(int narg, char **arg)
 {
-
-  // set default values for optional arguments
-
-  nthreads = -1;
-  use_shared_arrays=-1;
-  do_load_balance = 0;
-  use_optimized = 1;
-
-  // optional arguments
-
-  for (int i=0; i < narg; i++) {
-    if (i+2>narg) error->all(FLERR,"Illegal pair_style command");
-    if (strcmp(arg[i],"nthreads")==0) {
-      nthreads=force->inumeric(FLERR,arg[++i]);
-#if defined(LMP_USER_OMP)
-      error->all(FLERR,"Must set number of threads via package omp command");
-#else
-      omp_set_num_threads(nthreads);
-      comm->nthreads=nthreads;
-#endif
-      continue;
-    }
-    if (strcmp(arg[i],"optimized")==0) {
-      use_optimized=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"shared")==0) {
-      use_shared_arrays=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"loadbalance")==0) {
-      do_load_balance = force->inumeric(FLERR,arg[++i]);
-      if (do_load_balance) {
-        double mincutoff = extra_cutoff() +
-          rcutmax + neighbor->skin;
-        if (comm->cutghostuser < mincutoff) {
-          char buffer[255];
-
-          //apparently mincutoff is 0 after sprintf command ?????
-
-          double tmp = mincutoff + 0.1;
-          sprintf(buffer, "Communication cutoff is too small "
-                  "for SNAP micro load balancing, increased to %lf",
-                  mincutoff+0.1);
-          if (comm->me==0)
-            error->warning(FLERR,buffer);
-
-          comm->cutghostuser = tmp;
-
-        }
-      }
-      continue;
-    }
-    if (strcmp(arg[i],"schedule")==0) {
-      i++;
-      if (strcmp(arg[i],"static")==0)
-        schedule_user = 1;
-      if (strcmp(arg[i],"dynamic")==0)
-        schedule_user = 2;
-      if (strcmp(arg[i],"guided")==0)
-        schedule_user = 3;
-      if (strcmp(arg[i],"auto")==0)
-        schedule_user = 4;
-      if (strcmp(arg[i],"determine")==0)
-        schedule_user = 5;
-      if (schedule_user == 0)
-        error->all(FLERR,"Illegal pair_style command");
-      continue;
-    }
+  for (int i=0; i < narg; i++)
    error->all(FLERR,"Illegal pair_style command");
-  }
-
-  if (nthreads < 0)
-    nthreads = comm->nthreads;
-
-  if (use_shared_arrays < 0) {
-    if (nthreads > 1 && atom->nlocal <= 2*nthreads)
-      use_shared_arrays = 1;
-    else use_shared_arrays = 0;
-  }
-
-  // check if running non-optimized code with
-  // optimization flags set
-
-  if (!use_optimized)
-    if (nthreads > 1 ||
-        use_shared_arrays ||
-        do_load_balance ||
-        schedule_user)
-      error->all(FLERR,"Illegal pair_style command");
 }

 /* ----------------------------------------------------------------------
@ -1170,26 +428,14 @@ void PairSNAP::coeff(int narg, char **arg)

  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");

-  sna = new SNA*[nthreads];
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   diagonalstyle,
+                   rmin0,switchflag,bzeroflag);
+  snaptr->grow_rij(nmax);

-  // allocate memory for per OpenMP thread data which
-  // is wrapped into the sna class
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid] = new SNA(lmp,rfac0,twojmax,
-                       diagonalstyle,use_shared_arrays,
-                       rmin0,switchflag,bzeroflag);
-    if (!use_shared_arrays)
-      sna[tid]->grow_rij(nmax);
-  }
-
-  if (ncoeff != sna[0]->ncoeff) {
+  if (ncoeff != snaptr->ncoeff) {
    if (comm->me == 0)
-      printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
    error->all(FLERR,"Incorrect SNAP parameter file");
  }

@ -1216,13 +462,7 @@ void PairSNAP::init_style()
  neighbor->requests[irequest]->half = 0;
  neighbor->requests[irequest]->full = 1;

-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid]->init();
-  }
+  snaptr->init();

 }

@ -1370,6 +610,8 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
    }
  }

+  if (comm->me == 0) fclose(fpcoeff);
+
  // set flags for required keywords

  rcutfacflag = 0;
@ -1471,7 +713,6 @@ double PairSNAP::memory_usage()
  bytes += nmax*sizeof(int);
  bytes += (2*ncoeffall)*sizeof(double);
  bytes += (ncoeff*3)*sizeof(double);
-  bytes += sna[0]->memory_usage()*nthreads;
  return bytes;
 }

--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@ -29,8 +29,6 @@ public:
  PairSNAP(class LAMMPS *);
  ~PairSNAP();
  virtual void compute(int, int);
-  void compute_regular(int, int);
-  void compute_optimized(int, int);
  void settings(int, char **);
  virtual void coeff(int, char **);
  virtual void init_style();
@ -43,59 +41,16 @@ public:
 protected:
  int ncoeffq, ncoeffall;
  double **bvec, ***dbvec;
-  class SNA** sna;
+  class SNA* snaptr;
  int nmax;
-  int nthreads;
  virtual void allocate();
  void read_files(char *, char *);
  inline int equal(double* x,double* y);
  inline double dist2(double* x,double* y);
-  double extra_cutoff();
-  void load_balance();
-  void set_sna_to_shared(int snaid,int i);
-  void build_per_atom_arrays();

  void compute_beta();
  void compute_bispectrum();

-  int schedule_user;
-  double schedule_time_guided;
-  double schedule_time_dynamic;
-
-  int ncalls_neigh;
-  int do_load_balance;
-  int ilistmask_max;
-  int* ilistmask;
-  int ghostinum;
-  int ghostilist_max;
-  int* ghostilist;
-  int ghostnumneigh_max;
-  int* ghostnumneigh;
-  int* ghostneighs;
-  int* ghostfirstneigh;
-  int ghostneighs_total;
-  int ghostneighs_max;
-
-  int use_optimized;
-  int use_shared_arrays;
-
-  int i_max;
-  int i_neighmax;
-  int i_numpairs;
-  int **i_pairs;
-  double ***i_rij;
-  int **i_inside;
-  double **i_wj;
-  double **i_rcutij;
-  int *i_ninside;
-  double ****i_uarraytot_r, ****i_uarraytot_i;
-  double ******i_zarray_r, ******i_zarray_i;
-
-#ifdef TIMING_INFO
-  //  timespec starttime, endtime;
-  double timers[4];
-#endif
-
  double rcutmax;               // max cutoff for all elements
  int nelements;                // # of unique elements
  char **elements;              // names of unique elements
@ -130,15 +85,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.

-E: Must set number of threads via package omp command
-
-Because you are using the USER-OMP package, set the number of threads
-via its settings, not by the pair_style snap nthreads setting.
-
-W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
-
-Self-explanatory.
-
 E: Incorrect args for pair coefficients

 Self-explanatory.  Check the input script or data file.
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@ -21,7 +21,6 @@
 #include "math_extra.h"
 #include <cstring>
 #include <cstdlib>
-#include "openmp_snap.h"

 #include "memory.h"
 #include "error.h"
@ -114,12 +113,11 @@ using namespace MathConst;
 ------------------------------------------------------------------------- */

 SNA::SNA(LAMMPS* lmp, double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in, int diagonalstyle_in,
         double rmin0_in, int switch_flag_in, int bzero_flag_in) : Pointers(lmp)
 {
  wself = 1.0;

-  use_shared_arrays = use_shared_arrays_in;
  rfac0 = rfac0_in;
  rmin0 = rmin0_in;
  switch_flag = switch_flag_in;
@ -141,7 +139,8 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
  wj = NULL;
  rcutij = NULL;
  nmax = 0;
-  idxj = NULL;
+  idxz = NULL;
+  idxb= NULL;

  if (bzero_flag) {
    double www = wself*wself*wself;
@ -149,133 +148,178 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
      bzero[j] = www*(j+1);
  }

-#ifdef TIMING_INFO
-  timers = new double[20];
-  for(int i = 0; i < 20; i++) timers[i] = 0;
-  print = 0;
-  counter = 0;
-#endif
-
  build_indexlist();
-
-
 }

 /* ---------------------------------------------------------------------- */

 SNA::~SNA()
 {
-  if(!use_shared_arrays) {
-    destroy_twojmax_arrays();
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->destroy(bvec);
-    memory->destroy(dbvec);
-  }
-  delete[] idxj;
+  destroy_twojmax_arrays();
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->destroy(bvec);
+  memory->destroy(dbvec);
+  delete[] idxz;
+  delete[] idxb;
 }

 void SNA::build_indexlist()
 {
-  if(diagonalstyle == 0) {
-    int idxj_count = 0;
+  if(diagonalstyle != 3)
+    error->all(FLERR, "diagonal_style must be 3\n");

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          idxj_count++;
+  // index list for cglist

-    // indexList can be changed here
+  int jdim = twojmax + 1;
+  memory->create(idxcg_block, jdim, jdim, jdim,
+                 "sna:idxcg_block");

-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxcg_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxcg_block[j1][j2][j] = idxcg_count; 
+        for (int m1 = 0; m1 <= j1; m1++)
+          for (int m2 = 0; m2 <= j2; m2++)
+            idxcg_count++;
+      }
+  idxcg_max = idxcg_count;

-    idxj_count = 0;
+  // index list for uarray
+  // need to include both halves

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          idxj[idxj_count].j1 = j1;
-          idxj[idxj_count].j2 = j2;
-          idxj[idxj_count].j = j;
-          idxj_count++;
+  memory->create(idxu_block, jdim,
+                 "sna:idxu_block");
+
+  int idxu_count = 0;
+  
+  for(int j = 0; j <= twojmax; j++) {
+    idxu_block[j] = idxu_count; 
+    for(int mb = 0; mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++)
+        idxu_count++;
+  }
+  idxu_max = idxu_count;
+
+  // index list for beta and B
+
+  int idxb_count = 0;  
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) idxb_count++;
+  
+  idxb_max = idxb_count;
+  idxb = new SNA_BINDICES[idxb_max];
+  
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) {
+          idxb[idxb_count].j1 = j1;
+          idxb[idxb_count].j2 = j2;
+          idxb[idxb_count].j = j;
+          idxb_count++;
        }
-  }

-  if(diagonalstyle == 1) {
-    int idxj_count = 0;
+  // reverse index list for beta and b

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj_count++;
+  memory->create(idxb_block, jdim, jdim, jdim,
+                 "sna:idxb_block");
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        if (j < j1) continue;
+        idxb_block[j1][j2][j] = idxb_count; 
+        idxb_count++;
      }

-    // indexList can be changed here
+  // index list for zlist

-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxz_count = 0;

-    idxj_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++)
+            idxz_count++;
+  
+  idxz_max = idxz_count;
+  idxz = new SNA_ZINDICES[idxz_max];
+  
+  memory->create(idxz_block, jdim, jdim, jdim,
+                 "sna:idxz_block");
+  
+  idxz_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxz_block[j1][j2][j] = idxz_count; 

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj[idxj_count].j1 = j1;
-        idxj[idxj_count].j2 = j1;
-        idxj[idxj_count].j = j;
-        idxj_count++;
-      }
-  }
+        // find right beta[jjb] entry
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3

-  if(diagonalstyle == 2) {
-    int idxj_count = 0;
+        // CODE HORROR!! Need to figure this out later
+        double betaj = 1.0;
+// #ifdef USE_YDIRECT_ZLIST
+//         double betaj; 
+//         if (j >= j1) {
+//           const int jjb = idxb_block[j1][j2][j];
+//           if (j1 == j) {
+//             if (j2 == j) betaj = 3*beta[jjb];
+//             else betaj = 2*beta[jjb];
+//           } else betaj = beta[jjb]; 
+//         } else if (j >= j2) {
+//           const int jjb = idxb_block[j][j2][j1];
+//           if (j2 == j) betaj = 2*beta[jjb]*(j1+1)/(j+1.0);
+//           else betaj = beta[jjb]*(j1+1)/(j+1.0);
+//         } else {
+//           const int jjb = idxb_block[j2][j][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
+//         }
+// #else
+//         double betaj; 
+//         if (j >= j1) {
+//           const int jjb = idxb_block[j1][j2][j];
+//           betaj = beta[jjb]; 
+//         } else if (j >= j2) {
+//           const int jjb = idxb_block[j][j2][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0);
+//         } else {
+//           const int jjb = idxb_block[j2][j][j1];
+//           betaj = beta[jjb]*(j1+1)/(j+1.0); 
+//         }
+// #endif

-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj_count++;
-    }
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++) {
+            idxz[idxz_count].j1 = j1;
+            idxz[idxz_count].j2 = j2;
+            idxz[idxz_count].j = j;
+            idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
+            idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2;
+            idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1;
+            idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
+            idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2;
+            idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;

-    // indexList can be changed here
+            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+            // find right beta[jjb] and y_list[jju] entries

-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+            const int jju = idxu_block[j] + (j+1)*mb + ma;
+            idxz[idxz_count].jju = jju;
+            idxz[idxz_count].betaj = betaj;

-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj[idxj_count].j1 = j1;
-      idxj[idxj_count].j2 = j1;
-      idxj[idxj_count].j = j1;
-      idxj_count++;
-    }
-  }
-
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) idxj_count++;
-
-    // indexList can be changed here
-
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
-
-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            idxj[idxj_count].j1 = j1;
-            idxj[idxj_count].j2 = j2;
-            idxj[idxj_count].j = j;
-            idxj_count++;
+            idxz_count++;
          }
-  }
-
+      }
 }
 /* ---------------------------------------------------------------------- */

@ -292,16 +336,14 @@ void SNA::grow_rij(int newnmax)

  nmax = newnmax;

-  if(!use_shared_arrays) {
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->create(rij, nmax, 3, "pair:rij");
-    memory->create(inside, nmax, "pair:inside");
-    memory->create(wj, nmax, "pair:wj");
-    memory->create(rcutij, nmax, "pair:rcutij");
- }
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->create(rij, nmax, 3, "pair:rij");
+  memory->create(inside, nmax, "pair:inside");
+  memory->create(wj, nmax, "pair:wj");
+  memory->create(rcutij, nmax, "pair:rcutij");
 }
 /* ----------------------------------------------------------------------
   compute Ui by summing over neighbors j
@ -320,10 +362,6 @@ void SNA::compute_ui(int jnum)
  zero_uarraytot();
  addself_uarraytot(wself);

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
  for(int j = 0; j < jnum; j++) {
    x = rij[j][0];
    y = rij[j][1];
@ -339,48 +377,6 @@ void SNA::compute_ui(int jnum)
    add_uarraytot(r, wj[j], rcutij[j]);
  }

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[0] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-void SNA::compute_ui_omp(int jnum, int sub_threads)
-{
-  double rsq, r, x, y, z, z0, theta0;
-
-  // utot(j,ma,mb) = 0 for all j,ma,ma
-  // utot(j,ma,ma) = 1 for all j,ma
-  // for j in neighbors of i:
-  //   compute r0 = (x,y,z,z0)
-  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
-
-  zero_uarraytot();
-  addself_uarraytot(wself);
-
-  for(int j = 0; j < jnum; j++) {
-    x = rij[j][0];
-    y = rij[j][1];
-    z = rij[j][2];
-    rsq = x * x + y * y + z * z;
-    r = sqrt(rsq);
-    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
-    //    theta0 = (r - rmin0) * rscale0;
-    z0 = r / tan(theta0);
-    omp_set_num_threads(sub_threads);
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(x,y,z,z0,r,sub_threads) default(none)
-#endif
-    {
-      compute_uarray_omp(x, y, z, z0, r, sub_threads);
-    }
-    add_uarraytot(r, wj[j], rcutij[j]);
-  }
-
-
 }

 /* ----------------------------------------------------------------------
@ -389,24 +385,6 @@ void SNA::compute_ui_omp(int jnum, int sub_threads)

 void SNA::compute_zi()
 {
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,jmid
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif

  // compute_dbidrj() requires full j1/j2/j chunk of z elements
  // use zarray j1/j2 symmetry
@ -449,84 +427,13 @@ void SNA::compute_zi()
      } // end loop over j
    } // end loop over j1, j2

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-}
-
-void SNA::compute_zi_omp(int sub_threads)
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
-
-  if(omp_in_parallel())
-    omp_set_num_threads(sub_threads);
-
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
-
-#if defined(_OPENMP)
-#pragma omp parallel for schedule(auto) default(none)
-#endif
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-
-    double sumb1_r, sumb1_i;
-    int ma2, mb2;
-
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        zarray_r[j1][j2][j][ma][mb] = 0.0;
-        zarray_i[j1][j2][j][ma][mb] = 0.0;
-
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-          sumb1_r = 0.0;
-          sumb1_i = 0.0;
-
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
-              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
-
-            mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-            sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
-               uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
-            sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
-               uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
-          }
-
-          zarray_r[j1][j2][j][ma][mb] +=
-            sumb1_r * cgarray[j1][j2][j][ma1][ma2];
-          zarray_i[j1][j2][j][ma][mb] +=
-            sumb1_i * cgarray[j1][j2][j][ma1][ma2];
-        }
-      }
-  }
 }

 /* ----------------------------------------------------------------------
   compute Yi by summing over products of beta and Zi
 ------------------------------------------------------------------------- */

-void SNA::compute_yi(double* beta)
+void SNA::compute_yi(const double* beta)
 {
  int j;
  int idxz_count;
@ -540,18 +447,18 @@ void SNA::compute_yi(double* beta)
      } // end loop over ma, mb
  } // end loop over j

-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j3 = idxj[JJ].j;
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j3 = idxb[jjb].j;

    j = j3;
    jjjzarray_r = zarray_r[j1][j2][j3];
    jjjzarray_i = zarray_i[j1][j2][j3];
    for(int mb = 0; 2*mb <= j; mb++)
      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb];
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb];
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb];
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb];
      } // end loop over ma, mb

    j = j1;
@ -560,8 +467,8 @@ void SNA::compute_yi(double* beta)
    double j1fac = (j3+1)/(j+1.0);
    for(int mb = 0; 2*mb <= j; mb++)
      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j1fac;
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j1fac;
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j1fac;
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j1fac;
      } // end loop over ma, mb

    j = j2;
@ -570,8 +477,8 @@ void SNA::compute_yi(double* beta)
    double j2fac = (j3+1)/(j+1.0);
    for(int mb = 0; 2*mb <= j; mb++)
      for(int ma = 0; ma <= j; ma++) {
-        yarray_r[j][ma][mb] += beta[JJ]*jjjzarray_r[ma][mb]*j2fac;
-        yarray_i[j][ma][mb] += beta[JJ]*jjjzarray_i[ma][mb]*j2fac;
+        yarray_r[j][ma][mb] += beta[jjb]*jjjzarray_r[ma][mb]*j2fac;
+        yarray_i[j][ma][mb] += beta[jjb]*jjjzarray_i[ma][mb]*j2fac;
      } // end loop over ma, mb

  } // end loop over jjb
@ -655,10 +562,6 @@ void SNA::compute_bi()
  //            b(j1,j2,j) +=
  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
  for(int j1 = 0; j1 <= twojmax; j1++)
    for(int j2 = 0; j2 <= j1; j2++) {
      for(int j = abs(j1 - j2);
@ -691,12 +594,6 @@ void SNA::compute_bi()
      }
    }

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[2] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }

 /* ----------------------------------------------------------------------
@ -760,164 +657,8 @@ void SNA::compute_duidrj(double* rij, double wj, double rcut)
  z0 = r * cs / sn;
  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
  compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-/* ----------------------------------------------------------------------
-   calculate derivative of Bi w.r.t. atom j
-   variant using indexlist for j1,j2,j
-   variant not using symmetry relation
------------------------------------------------------------------------- */
-
-void SNA::compute_dbidrj_nonsymm()
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        dbdr(j1,j2,j) = 0
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            dzdr = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  (dudr(j1,ma1,mb1) * u(j2,ma2,mb2) +
-  //                  u(j1,ma1,mb1) * dudr(j2,ma2,mb2))
-  //              dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-  //            dbdr(j1,j2,j) +=
-  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-  //              Conj(u(j,ma,mb))*dzdr
-
-  double* dbdr;
-  double* dudr_r, *dudr_i;
-  double sumb1_r[3], sumb1_i[3], dzdr_r[3], dzdr_i[3];
-  int ma2;
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-    dbdr = dbarray[j1][j2][j];
-    dbdr[0] = 0.0;
-    dbdr[1] = 0.0;
-    dbdr[2] = 0.0;
-
-    double** *j1duarray_r = duarray_r[j1];
-    double** *j2duarray_r = duarray_r[j2];
-    double** *j1duarray_i = duarray_i[j1];
-    double** *j2duarray_i = duarray_i[j2];
-    double** j1uarraytot_r = uarraytot_r[j1];
-    double** j2uarraytot_r = uarraytot_r[j2];
-    double** j1uarraytot_i = uarraytot_i[j1];
-    double** j2uarraytot_i = uarraytot_i[j2];
-    double** j1j2jcgarray = cgarray[j1][j2][j];
-
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        dzdr_r[0] = 0.0;
-        dzdr_r[1] = 0.0;
-        dzdr_r[2] = 0.0;
-        dzdr_i[0] = 0.0;
-        dzdr_i[1] = 0.0;
-        dzdr_i[2] = 0.0;
-
-        const int max_mb1 = MIN(j1, (2 * mb - j + j2 + j1) / 2) + 1;
-        const int max_ma1 = MIN(j1, (2 * ma - j + j2 + j1) / 2) + 1;
-
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 < max_ma1; ma1++) {
-
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-          sumb1_r[0] = 0.0;
-          sumb1_r[1] = 0.0;
-          sumb1_r[2] = 0.0;
-          sumb1_i[0] = 0.0;
-          sumb1_i[1] = 0.0;
-          sumb1_i[2] = 0.0;
-
-          //inside loop 54 operations (mul and add)
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2),
-              mb2 = mb + (j1 + j2 - j) / 2 - mb1;
-              mb1 < max_mb1; mb1++, mb2--) {
-
-            double* dudr1_r, *dudr1_i, *dudr2_r, *dudr2_i;
-
-            dudr1_r = j1duarray_r[ma1][mb1];
-            dudr2_r = j2duarray_r[ma2][mb2];
-            dudr1_i = j1duarray_i[ma1][mb1];
-            dudr2_i = j2duarray_i[ma2][mb2];
-
-            const double cga_mb1mb2 = j1j2jcgarray[mb1][mb2];
-            const double uat_r_ma2mb2 = cga_mb1mb2 * j2uarraytot_r[ma2][mb2];
-            const double uat_r_ma1mb1 = cga_mb1mb2 * j1uarraytot_r[ma1][mb1];
-            const double uat_i_ma2mb2 = cga_mb1mb2 * j2uarraytot_i[ma2][mb2];
-            const double uat_i_ma1mb1 = cga_mb1mb2 * j1uarraytot_i[ma1][mb1];
-
-            for(int k = 0; k < 3; k++) {
-              sumb1_r[k] += dudr1_r[k] * uat_r_ma2mb2;
-              sumb1_r[k] -= dudr1_i[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_r[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_i[k] * uat_r_ma2mb2;
-
-              sumb1_r[k] += dudr2_r[k] * uat_r_ma1mb1;
-              sumb1_r[k] -= dudr2_i[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_r[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_i[k] * uat_r_ma1mb1;
-            }
-          } // end loop over mb1,mb2
-
-          // dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-
-          dzdr_r[0] += sumb1_r[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[1] += sumb1_r[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[2] += sumb1_r[2] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[0] += sumb1_i[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[1] += sumb1_i[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[2] += sumb1_i[2] * j1j2jcgarray[ma1][ma2];
-        } // end loop over ma1,ma2
-
-        // dbdr(j1,j2,j) +=
-        //   Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-        //   Conj(u(j,ma,mb))*dzdr
-
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-
-        for(int k = 0; k < 3; k++)
-          dbdr[k] +=
-            (dudr_r[k] * zarray_r[j1][j2][j][ma][mb] +
-             dudr_i[k] * zarray_i[j1][j2][j][ma][mb]) +
-            (uarraytot_r[j][ma][mb] * dzdr_r[k] +
-             uarraytot_i[j][ma][mb] * dzdr_i[k]);
-      } //end loop over ma mb
-
-  } //end loop over j1 j2 j
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }

 /* ----------------------------------------------------------------------
@ -958,14 +699,10 @@ void SNA::compute_dbidrj()
  double jjjmambzarray_r;
  double jjjmambzarray_i;

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j = idxb[jjb].j;

    dbdr = dbarray[j1][j2][j];
    dbdr[0] = 0.0;
@ -1149,12 +886,6 @@ void SNA::compute_dbidrj()

  } //end loop over j1 j2 j

-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }

 /* ----------------------------------------------------------------------
@ -1251,27 +982,6 @@ void SNA::add_uarraytot(double r, double wj, double rcut)
      }
 }

-void SNA::add_uarraytot_omp(double r, double wj, double rcut)
-{
-  double sfac;
-
-  sfac = compute_sfac(r, rcut);
-
-  sfac *= wj;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] +=
-          sfac * uarray_r[j][ma][mb];
-        uarraytot_i[j][ma][mb] +=
-          sfac * uarray_i[j][ma][mb];
-      }
-}
-
 /* ----------------------------------------------------------------------
   compute Wigner U-functions for one neighbor
 ------------------------------------------------------------------------- */
@ -1348,88 +1058,6 @@ void SNA::compute_uarray(double x, double y, double z,
  }
 }

-void SNA::compute_uarray_omp(double x, double y, double z,
-                             double z0, double r, int /*sub_threads*/)
-{
-  double r0inv;
-  double a_r, b_r, a_i, b_i;
-  double rootpq;
-
-  // compute Cayley-Klein parameters for unit quaternion
-
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  a_r = r0inv * z0;
-  a_i = -r0inv * z;
-  b_r = r0inv * y;
-  b_i = -r0inv * x;
-
-  // VMK Section 4.8.2
-
-  uarray_r[0][0][0] = 1.0;
-  uarray_i[0][0][0] = 0.0;
-
-  for (int j = 1; j <= twojmax; j++) {
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int mb = 0; mb < j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
-
-      for (int ma = 0; ma < j; ma++) {
-        rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_r[j - 1][ma][mb] +
-           a_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_i[j - 1][ma][mb] -
-           a_i * uarray_r[j - 1][ma][mb]);
-
-        rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_r[j - 1][ma][mb] +
-           b_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_i[j - 1][ma][mb] -
-           b_i * uarray_r[j - 1][ma][mb]);
-      }
-    }
-
-    int mb = j;
-    uarray_r[j][0][mb] = 0.0;
-    uarray_i[j][0][mb] = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int ma = 0; ma < j; ma++) {
-      rootpq = rootpqarray[j - ma][mb];
-      uarray_r[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_r[j - 1][ma][mb - 1] -
-         b_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_i[j - 1][ma][mb - 1] +
-         b_i * uarray_r[j - 1][ma][mb - 1]);
-
-      rootpq = rootpqarray[ma + 1][mb];
-      uarray_r[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_r[j - 1][ma][mb - 1] -
-         a_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_i[j - 1][ma][mb - 1] +
-         a_i * uarray_r[j - 1][ma][mb - 1]);
-    }
-  }
-}
-
 /* ----------------------------------------------------------------------
   compute derivatives of Wigner U-functions for one neighbor
   see comments in compute_uarray()
@ -1644,20 +1272,18 @@ void SNA::create_twojmax_arrays()
    bzero = NULL;


-  if(!use_shared_arrays) {
-    memory->create(uarraytot_r, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-    memory->create(uarraytot_i, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-    memory->create(yarray_r, jdim, jdim, jdim,
-                   "sna:yarray");
-    memory->create(yarray_i, jdim, jdim, jdim,
-                   "sna:yarray");
-  }
+  memory->create(uarraytot_r, jdim, jdim, jdim,
+                 "sna:uarraytot");
+  memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
+                 "sna:zarray");
+  memory->create(uarraytot_i, jdim, jdim, jdim,
+                 "sna:uarraytot");
+  memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
+                 "sna:zarray");
+  memory->create(yarray_r, jdim, jdim, jdim,
+                 "sna:yarray");
+  memory->create(yarray_i, jdim, jdim, jdim,
+                 "sna:yarray");

 }

@ -1680,14 +1306,12 @@ void SNA::destroy_twojmax_arrays()
  if (bzero_flag)
    memory->destroy(bzero);

-  if(!use_shared_arrays) {
-    memory->destroy(uarraytot_r);
-    memory->destroy(zarray_r);
-    memory->destroy(uarraytot_i);
-    memory->destroy(zarray_i);
-    memory->destroy(yarray_r);
-    memory->destroy(yarray_i);
-  }
+  memory->destroy(uarraytot_r);
+  memory->destroy(zarray_r);
+  memory->destroy(uarraytot_i);
+  memory->destroy(zarray_i);
+  memory->destroy(yarray_r);
+  memory->destroy(yarray_i);
 }

 /* ----------------------------------------------------------------------
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@ -24,14 +24,19 @@

 namespace LAMMPS_NS {

-struct SNA_LOOPINDICES {
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
+  double betaj;
+};
+
+struct SNA_BINDICES {
  int j1, j2, j;
 };

 class SNA : protected Pointers {

 public:
-  SNA(LAMMPS*, double, int, int, int, double, int, int);
+  SNA(LAMMPS*, double, int, int, double, int, int);

  SNA(LAMMPS* lmp) : Pointers(lmp) {};
  ~SNA();
@ -44,10 +49,8 @@ public:
  // functions for bispectrum coefficients

  void compute_ui(int);
-  void compute_ui_omp(int, int);
  void compute_zi();
-  void compute_zi_omp(int);
-  void compute_yi(double*);
+  void compute_yi(const double*);
  void compute_bi();
  void copy_bi2bvec();

@ -56,20 +59,10 @@ public:
  void compute_duidrj(double*, double, double);
  void compute_dbidrj();
  void compute_deidrj(double*);
-  void compute_dbidrj_nonsymm();
  void copy_dbi2dbvec();
  double compute_sfac(double, double);
  double compute_dsfac(double, double);

-#ifdef TIMING_INFO
-  double* timers;
-  timespec starttime, endtime;
-  int print;
-  int counter;
-#endif
-
-  //per sna class instance for OMP use
-
  double* bvec, ** dbvec;
  double** rij;
  int* inside;
@ -83,16 +76,17 @@ public:
  double*** uarraytot_r, *** uarraytot_i;
  double***** zarray_r, ***** zarray_i;
  double*** yarray_r, *** yarray_i;
-  double*** uarraytot_r_b, *** uarraytot_i_b;
-  double***** zarray_r_b, ***** zarray_i_b;
  double*** uarray_r, *** uarray_i;

 private:
  double rmin0, rfac0;

-  //use indexlist instead of loops, constructor generates these
-  SNA_LOOPINDICES* idxj;
-  int idxj_max;
+  // use indexlist instead of loops, constructor generates these
+
+  SNA_ZINDICES* idxz;
+  SNA_BINDICES* idxb;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
  // data for bispectrum coefficients

  double***** cgarray;
@ -104,6 +98,21 @@ private:
  double**** duarray_r, **** duarray_i;
  double**** dbarray;

+  double* cglist;  
+  int*** idxcg_block; 
+
+  double* ulisttot_r, * ulisttot_i;
+  double* ulist_r, * ulist_i;
+  int* idxu_block;
+
+  double* zlist_r, * zlist_i;
+  int*** idxz_block;
+
+  int*** idxb_block;
+
+  double** dulist_r, ** dulist_i;
+  double* ylist_r, * ylist_i;
+
  static const int nmaxfactorial = 167;
  static const double nfac_table[];
  double factorial(int);
@ -118,22 +127,13 @@ private:
  void zero_uarraytot();
  void addself_uarraytot(double);
  void add_uarraytot(double, double, double);
-  void add_uarraytot_omp(double, double, double);
  void compute_uarray(double, double, double,
                      double, double);
-  void compute_uarray_omp(double, double, double,
-                          double, double, int);
  double deltacg(int, int, int);
  int compute_ncoeff();
  void compute_duarray(double, double, double,
                       double, double, double, double, double);

-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
  // Sets the style for the switching function
  // 0 = none
  // 1 = cosine