Merge branch 'master' into collected-small-fixes

2020-03-25 06:50:11 -04:00 · 2020-03-25 06:50:11 -04:00 · 3704d90efb
parent 0060473cee 91c1ae8077
commit 3704d90efb
10 changed files with 367 additions and 485 deletions
--- a/doc/src/Howto_bash.rst
+++ b/doc/src/Howto_bash.rst
@ -12,6 +12,10 @@ via apt-get and all files are accessible in both the Windows Explorer and your
 Linux shell (bash). This avoids switching to a different operating system or
 installing a virtual machine. Everything runs on Windows.
 .. seealso::
   You can find more detailed information at the `Windows Subsystem for Linux Installation Guide for Windows 10 <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`_.
 Installing Bash on Windows
 --------------------------
@ -103,7 +107,7 @@ needed for various LAMMPS features:
 .. code-block:: bash
-   sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng12-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools
+   sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools
 Files in Ubuntu on Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
--- a/doc/src/fix_numdiff.rst
+++ b/doc/src/fix_numdiff.rst
@ -29,14 +29,14 @@ Description
 Calculate forces through finite difference calculations of energy
 versus position.  These forces can be compared to analytic forces
-computed by pair styles, bond styles, etc.  E.g. for debugging
+computed by pair styles, bond styles, etc.  This can be useful for
-purposes.
+debugging or other purposes.
 The group specified with the command means only atoms within the group
 have their averages computed.  Results are set to 0.0 for atoms not in
 the group.
-This fix performs a loop over all atoms (in the group).  For each atom
+This fix performs a loop over all atoms in the group.  For each atom
 and each component of force it adds *delta* to the position, and
 computes the new energy of the entire system.  It then subtracts
 *delta* from the original position and again computes the new energy
@ -66,10 +66,10 @@ by two times *delta*.
 .. note::
   The cost of each energy evaluation is essentially the cost of an MD
-   timestep.  This invoking this fix once has a cost of 2N timesteps,
+   timestep.  Thus invoking this fix once for a 3d system has a cost
-   where N is the total number of atoms in the system (assuming all atoms
+   of 6N timesteps, where N is the total number of atoms in the system
-   are included in the group).  So this fix can be very expensive to use
+   (assuming all atoms are included in the group).  So this fix can be
-   for large systems.
+   very expensive to use for large systems.
 ----------
--- a/examples/README
+++ b/examples/README
@ -93,6 +93,7 @@ msst:     MSST shock dynamics
 nb3b:     use of nonbonded 3-body harmonic pair style
 neb:      nudged elastic band (NEB) calculation for barrier finding
 nemd:     non-equilibrium MD of 2d sheared system
 numdiff: numerical difference computation of forces
 obstacle: flow around two voids in a 2d channel
 peptide:  dynamics of a small solvated peptide chain (5-mer)
 peri:     Peridynamic model of cylinder impacted by indenter
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -37,15 +37,13 @@ struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeNeigh{};
 struct TagPairSNAPPreUi{};
 struct TagPairSNAPComputeUi{};
 struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
 struct TagPairSNAPComputeUiCPU{};
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPComputeBi{};
 struct TagPairSNAPZeroYi{};
 struct TagPairSNAPComputeYi{};
-struct TagPairSNAPComputeDuidrj{};
+struct TagPairSNAPComputeFusedDeidrj{};
 struct TagPairSNAPComputeDuidrjCPU{};
 struct TagPairSNAPComputeDeidrj{};
 struct TagPairSNAPComputeDeidrjCPU{};
 template<class DeviceType>
@ -83,9 +81,6 @@ public:
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
@ -102,14 +97,11 @@ public:
  void operator() (TagPairSNAPComputeYi,const int& ii) const;
  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
+  void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -30,7 +30,6 @@
 #include "kokkos.h"
 #include "sna.h"
 #define MAXLINE 1024
 #define MAXWORD 3
@ -255,26 +254,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
        // 2 is for double buffer
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      const int tile_size = (twojmax+1)*(twojmax+1);
      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
-      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
+      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size );
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      Kokkos::parallel_for("ComputeUi",policy_ui,*this);
      // ComputeUitot
      vector_length = 1;
      team_size = 128;
      team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
    }
@ -316,7 +308,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
    Kokkos::parallel_for("ComputeYi",policy_yi,*this);
-    //ComputeDuidrj
+    //ComputeDuidrj and Deidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
@ -324,53 +316,37 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      snaKK.set_dir(-1); // technically doesn't do anything
      Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions
-      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
    } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::team_size_max(*this);
      int vector_length = 32;
      int team_size = 2; // need to cap b/c of shared memory reqs
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
-      // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
+      // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
      // 2 is for double buffer
      const int tile_size = (twojmax+1)*(twojmax/2+1);
      typedef Kokkos::View< SNAcomplex*,
                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
              ScratchViewType;
      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size);
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
      // Need to call three times, once for each direction
      for (int k = 0; k < 3; k++) {
        snaKK.set_dir(k);
-        Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
+        Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
      }
    }
    //ComputeDeidrj
    if (lmp->kokkos->ngpus == 0) { // CPU
      int vector_length = 1;
      int team_size = 1;
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
    } else { // GPU, different loop strategy internally
      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
      int vector_length = 32; // coalescing disaster right now, will fix later
      int team_size = 8;
      if (team_size*vector_length > team_size_max)
        team_size = team_size_max/vector_length;
      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
      Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
    }
    //ComputeForce
    if (eflag) {
      if (neighflag == HALF) {
@ -642,25 +618,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
  my_sna.compute_ui(team,ii,jj);
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  // Extract the quantum number
  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
  if (idx >= my_sna.idxu_max) return;
  // Extract the atomic index
  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
  if (ii >= chunk_size) return;
  // Extract the number of neighbors neighbor number
  const int ninside = d_ninside(ii);
  my_sna.compute_uitot(team,idx,ii,ninside);
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
@ -718,7 +675,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  // Extract the atom number
@ -730,7 +687,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;
-  my_sna.compute_duidrj(team,ii,jj);
+  my_sna.compute_fused_deidrj(team,ii,jj);
 }
 template<class DeviceType>
@ -750,24 +707,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const t
  my_sna.compute_duidrj_cpu(team,ii,jj);
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;
  // Extract the atom number
  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
  if (ii >= chunk_size) return;
  // Extract the neighbor number
  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
  const int ninside = d_ninside(ii);
  if (jj >= ninside) return;
  my_sna.compute_deidrj(team,ii,jj);
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -135,14 +135,10 @@ inline
  KOKKOS_INLINE_FUNCTION
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
-  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_zi(const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void zero_yi(const int&,const int&); // ForceSNAP
@ -155,12 +151,10 @@ inline
  // functions for derivatives
  KOKKOS_INLINE_FUNCTION
-  void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
+  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  double compute_sfac(double, double); // add_uarraytot, compute_duarray
@ -251,10 +245,6 @@ inline
  KOKKOS_INLINE_FUNCTION
  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui
  KOKKOS_INLINE_FUNCTION
  void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                      double, double, double,
                      double, double); // compute_ui
  KOKKOS_INLINE_FUNCTION
  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                      double, double, double,
@ -267,12 +257,8 @@ inline
 inline
  int compute_ncoeff();           // SNAKokkos()
  KOKKOS_INLINE_FUNCTION
  void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                       double, double, double, // compute_duidrj
                       double, double, double, double, double);
  KOKKOS_INLINE_FUNCTION
  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       double, double, double, // compute_duidrj
+                       double, double, double, // compute_duidrj_cpu
                       double, double, double, double, double);
  // Sets the style for the switching function
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -19,6 +19,7 @@
 #include <cmath>
 #include <cstring>
 #include <cstdlib>
 #include <type_traits>
 namespace LAMMPS_NS {
@ -231,11 +232,22 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
  zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom);
  //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
 #ifdef KOKKOS_ENABLE_CUDA
  if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
    // dummy allocation
    ulist = t_sna_3c_ll("sna:ulist",1,1,1);
    dulist = t_sna_4c_ll("sna:dulist",1,1,1);
  } else {
 #endif
    ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
    dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
 #ifdef KOKKOS_ENABLE_CUDA
  }
 #endif
  //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
  ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom);
  //dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
  dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
 }
@ -269,14 +281,14 @@ void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>
 }
 /* ----------------------------------------------------------------------
-   compute Ui by summing over bispectrum components
+   compute Ui by computing Wigner U-functions for one neighbor and
   accumulating to the total. GPU only.
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
 {
  double rsq, r, x, y, z, z0, theta0;
  // utot(j,ma,mb) = 0 for all j,ma,ma
  // utot(j,ma,ma) = 1 for all j,ma
@ -284,22 +296,143 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
  //   compute r0 = (x,y,z,z0)
  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
-  x = rij(iatom,jnbor,0);
+  // get shared memory offset
-  y = rij(iatom,jnbor,1);
+  const int max_m_tile = (twojmax+1)*(twojmax+1);
-  z = rij(iatom,jnbor,2);
+  const int team_rank = team.team_rank();
-  rsq = x * x + y * y + z * z;
+  const int scratch_shift = team_rank * max_m_tile;
  r = sqrt(rsq);
-  theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
+  // double buffer
  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  const double x = rij(iatom,jnbor,0);
  const double y = rij(iatom,jnbor,1);
  const double z = rij(iatom,jnbor,2);
  const double wj_local = wj(iatom, jnbor);
  const double rcut = rcutij(iatom, jnbor);
  const double rsq = x * x + y * y + z * z;
  const double r = sqrt(rsq);
  const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
  //    theta0 = (r - rmin0) * rscale0;
-  z0 = r / tan(theta0);
+  const double cs = cos(theta0);
  const double sn = sin(theta0);
  const double z0 = r * cs / sn; // r / tan(theta0)
-  compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
+  // Compute cutoff function
  const double sfac = compute_sfac(r, rcut) * wj_local;
-  // if we're on the GPU, accumulating into uarraytot is done in a separate kernel.
+  // compute Cayley-Klein parameters for unit quaternion,
-  // if we're not, it's more efficient to include it in compute_uarray.
+  // pack into complex number
  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
  const SNAcomplex a = { r0inv * z0, -r0inv * z };
  const SNAcomplex b = { r0inv * y, -r0inv * x };
  // VMK Section 4.8.2
  // All writes go to global memory and shared memory
  // so we can avoid all global memory reads
  Kokkos::single(Kokkos::PerThread(team), [=]() {
    //ulist(0,iatom,jnbor) = { 1.0, 0.0 };
    buf1[0] = {1.,0.};
    Kokkos::atomic_add(&(ulisttot(0,iatom).re), sfac);
  });
  for (int j = 1; j <= twojmax; j++) {
    const int jju = idxu_block[j];
    const int jjup = idxu_block[j-1];
    // fill in left side of matrix layer from previous layer
    // Flatten loop over ma, mb, need to figure out total
    // number of iterations
    // for (int ma = 0; ma <= j; ma++)
    const int n_ma = j+1;
    // for (int mb = 0; 2*mb <= j; mb++)
    const int n_mb = j/2+1;
    // the last (j / 2) can be avoided due to symmetry
    const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
    //for (int m = 0; m < total_iters; m++) {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
      [&] (const int m) {
      // ma fast, mb slow
      int ma = m % n_ma;
      int mb = m / n_ma;
      // index into global memory array
      const int jju_index = jju+m;
      //const int jjup_index = jjup+mb*j+ma;
      // index into shared memory buffer for this level
      const int jju_shared_idx = m;
      // index into shared memory buffer for next level
      const int jjup_shared_idx = jju_shared_idx - mb;
      SNAcomplex u_accum = {0., 0.};
      // VMK recursion relation: grab contribution which is multiplied by b*
      const double rootpq2 = -rootpqarray(ma, j - mb);
      const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
      //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
      caconjxpy(b, u_up2, u_accum);
      // VMK recursion relation: grab contribution which is multiplied by a*
      const double rootpq1 = rootpqarray(j - ma, j - mb);
      const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
      //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.);
      caconjxpy(a, u_up1, u_accum);
      //ulist(jju_index,iatom,jnbor) = u_accum;
      // back up into shared memory for next iter
      buf2[jju_shared_idx] = u_accum;
      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re);
      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im);
      // copy left side to right side with inversion symmetry VMK 4.4(2)
      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
      // if j is even (-> physical j integer), last element maps to self, skip
      //if (!(m == total_iters - 1 && j % 2 == 0)) { 
      if (m < total_iters - 1 || j % 2 == 1) {
        const int sign_factor = (((ma+mb)%2==0)?1:-1);
        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
        const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
        if (sign_factor == 1) {
          u_accum.im = -u_accum.im;
        } else {
          u_accum.re = -u_accum.re;
        }
        //ulist(jjup_flip,iatom,jnbor) = u_accum;
        buf2[jju_shared_flip] = u_accum;
        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re);
        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im);
      }
    });
    // In CUDA backend,
    // ThreadVectorRange has a __syncwarp (appropriately masked for 
    // vector lengths < 32) implict at the end
    // swap double buffers
    auto tmp = buf1; buf1 = buf2; buf2 = tmp;
  }
 }
 /* ----------------------------------------------------------------------
   compute Ui by summing over bispectrum components. CPU only.
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -327,40 +460,8 @@ void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<Dev
 }
 /* ----------------------------------------------------------------------
   compute UiTot by summing over neighbors
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int idx, int iatom, int ninside)
 {
  // fuse initialize in, avoid this load?
  SNAcomplex utot = ulisttot(idx, iatom);
  for (int jnbor = 0; jnbor < ninside; jnbor++) {
    const auto x = rij(iatom,jnbor,0);
    const auto y = rij(iatom,jnbor,1);
    const auto z = rij(iatom,jnbor,2);
    const auto rsq = x * x + y * y + z * z;
    const auto r = sqrt(rsq);
    const double wj_local = wj(iatom, jnbor);
    const double rcut = rcutij(iatom, jnbor);
    const double sfac = compute_sfac(r, rcut) * wj_local;
    auto ulist_local = ulist(idx, iatom, jnbor);
    utot.re += sfac * ulist_local.re;
    utot.im += sfac * ulist_local.im;
  }
  ulisttot(idx, iatom) = utot;
 }
 /* ----------------------------------------------------------------------
   compute Zi by summing over products of Ui
   not updated yet
 ------------------------------------------------------------------------- */
 template<class DeviceType>
@ -509,72 +610,203 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
 }
 /* ----------------------------------------------------------------------
-   compute dEidRj
+   Fused calculation of the derivative of Ui w.r.t. atom j
   and of dEidRj. GPU only.
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
 {
-  t_scalar3<double> final_sum;
+  // get shared memory offset
  const int max_m_tile = (twojmax+1)*(twojmax/2+1);
  const int team_rank = team.team_rank();
  const int scratch_shift = team_rank * max_m_tile;
-  // Like in ComputeUi/ComputeDuidrj, regular loop over j.
+  // double buffer for ulist
-  for (int j = 0; j <= twojmax; j++) {
+  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
-    int jju = idxu_block(j);
+  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
-    // Flatten loop over ma, mb, reduce w/in
+  // double buffer for dulist
  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  const double x = rij(iatom,jnbor,0);
  const double y = rij(iatom,jnbor,1);
  const double z = rij(iatom,jnbor,2);
  const double rsq = x * x + y * y + z * z;
  const double r = sqrt(rsq);
  const double rcut = rcutij(iatom, jnbor);
  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
  const double theta0 = (r - rmin0) * rscale0;
  const double cs = cos(theta0);
  const double sn = sin(theta0);
  const double z0 = r * cs / sn;
  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
  const double wj_local = wj(iatom, jnbor);
  const double sfac = wj_local * compute_sfac(r, rcut);
  const double dsfac = wj_local * compute_dsfac(r, rcut);
  const double rinv = 1.0 / r;
  // extract a single unit vector
  const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
  // Compute Cayley-Klein parameters for unit quaternion
  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
  const SNAcomplex a = { r0inv * z0, -r0inv * z };
  const SNAcomplex b = { r0inv * y, -r0inv * x };
  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
  const double dr0inv = dr0invdr * u;
  const double dz0 = dz0dr * u;
  const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
                              - z * dr0inv + (dir == 2 ? - r0inv : 0.) };
  const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
                              -x * dr0inv + (dir==0?-r0inv:0.) };
  // Accumulate the full contribution to dedr on the fly
  const double du_prod = dsfac * u; // chain rule
  const SNAcomplex y_local = ylist(0, iatom);
  // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
  double dedr_full_sum = 0.5 * du_prod * y_local.re;
  // single has a warp barrier at the end
  Kokkos::single(Kokkos::PerThread(team), [=]() {
    //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
    ulist_buf1[0] = {1., 0.};
    dulist_buf1[0] = {0., 0.};
  });
  for (int j = 1; j <= twojmax; j++) {
    int jju = idxu_block[j];
    int jjup = idxu_block[j-1];
    // flatten the loop over ma,mb
    // for (int ma = 0; ma <= j; ma++)
    const int n_ma = j+1;
    // for (int mb = 0; 2*mb <= j; mb++)
    const int n_mb = j/2+1;
    const int total_iters = n_ma * n_mb;
-    t_scalar3<double> sum;
+    double dedr_sum = 0.; // j-local sum
    //for (int m = 0; m < total_iters; m++) {
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m, t_scalar3<double>& sum_tmp) {
+      [&] (const int m, double& sum_tmp) {
      // ma fast, mb slow
      int ma = m % n_ma;
      int mb = m / n_ma;
-      // get index
+      const int jju_index = jju+m;
      const int jju_index = jju+mb+mb*j+ma;
      // get ylist, rescale last element by 0.5
      SNAcomplex y_local = ylist(jju_index,iatom);
      const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0);
      const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1);
      const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2);
      // Load y_local, apply the symmetry scaling factor
      // The "secret" of the shared memory optimization is it eliminates
      // all global memory reads to duidrj in lieu of caching values in
      // shared memory and otherwise always writing, making the kernel
      // ultimately compute bound. We take advantage of that by adding
      // some reads back in.
      auto y_local = ylist(jju_index,iatom);
      if (j % 2 == 0 && 2*mb == j) {
        if (ma == mb) { y_local = 0.5*y_local; }
-        else if (ma > mb) { y_local = { 0., 0. }; }
+        else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
        // else the ma < mb gets "double counted", cancelling the 0.5.
      }
-      sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im;
+      // index into shared memory
-      sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im;
+      const int jju_shared_idx = m;
-      sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im;
+      const int jjup_shared_idx = jju_shared_idx - mb;
-    }, sum); // end loop over flattened ma,mb
+      // Need to compute and accumulate both u and du (mayhaps, we could probably
      // balance some read and compute by reading u each time).
      SNAcomplex u_accum = { 0., 0. };
      SNAcomplex du_accum = { 0., 0. };
-    final_sum.x += sum.x;
+      const double rootpq2 = -rootpqarray(ma, j - mb);
-    final_sum.y += sum.y;
+      const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
-    final_sum.z += sum.z;
+      caconjxpy(b, u_up2, u_accum);
      const double rootpq1 = rootpqarray(j - ma, j - mb);
      const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
      caconjxpy(a, u_up1, u_accum);
      // Next, spin up du_accum
      const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
      caconjxpy(da, u_up1, du_accum);
      caconjxpy(a, du_up1, du_accum);
      const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
      caconjxpy(db, u_up2, du_accum);
      caconjxpy(b, du_up2, du_accum);
      // No need to save u_accum to global memory
      // Cache u_accum, du_accum to scratch memory.
      ulist_buf2[jju_shared_idx] = u_accum;
      dulist_buf2[jju_shared_idx] = du_accum;
      // Directly accumulate deidrj into sum_tmp
      //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
      const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
      sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
      // copy left side to right side with inversion symmetry VMK 4.4(2)
      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
      if (j%2==1 && mb+1==n_mb) {
        int sign_factor = (((ma+mb)%2==0)?1:-1); 
        //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
        if (sign_factor == 1) {
          u_accum.im = -u_accum.im;
          du_accum.im = -du_accum.im;
        } else {
          u_accum.re = -u_accum.re;
          du_accum.re = -du_accum.re;
        }
-  Kokkos::single(Kokkos::PerThread(team), [&] () {
+        // We don't need the second half of the tile for the deidrj accumulation.
-    dedr(iatom,jnbor,0) = final_sum.x*2.0;
+        // That's taken care of by the symmetry factor above.
-    dedr(iatom,jnbor,1) = final_sum.y*2.0;
+        //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
    dedr(iatom,jnbor,2) = final_sum.z*2.0;
  });
        // We do need it for ortho polynomial generation, though
        ulist_buf2[jju_shared_flip] = u_accum;
        dulist_buf2[jju_shared_flip] = du_accum;
      }
    }, dedr_sum);
    // swap buffers
    auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
    tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
    // Accumulate dedr. This "should" be in a single, but 
    // a Kokkos::single call implies a warp sync, and we may
    // as well avoid that. This does no harm as long as the
    // final assignment is in a single block.
    //Kokkos::single(Kokkos::PerThread(team), [=]() {
      dedr_full_sum += dedr_sum; 
    //});
  }
  // Store the accumulated dedr.
  Kokkos::single(Kokkos::PerThread(team), [&] () {
      dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
  });
 }
 /* ----------------------------------------------------------------------
   compute dEidRj, CPU path only.
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -708,28 +940,6 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
   calculate derivative of Ui w.r.t. atom j
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
  double rsq, r, x, y, z, z0, theta0, cs, sn;
  double dz0dr;
  x = rij(iatom,jnbor,0);
  y = rij(iatom,jnbor,1);
  z = rij(iatom,jnbor,2);
  rsq = x * x + y * y + z * z;
  r = sqrt(rsq);
  double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
  theta0 = (r - rmin0) * rscale0;
  cs = cos(theta0);
  sn = sin(theta0);
  z0 = r * cs / sn;
  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
  compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -774,119 +984,6 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
   compute Wigner U-functions for one neighbor
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
                         double x, double y, double z,
                         double z0, double r)
 {
  // define size of scratch memory buffer
  const int max_m_tile = (twojmax+1)*(twojmax+1);
  const int team_rank = team.team_rank();
  // get scratch memory double buffer
  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  // compute Cayley-Klein parameters for unit quaternion,
  // pack into complex number
  double r0inv = 1.0 / sqrt(r * r + z0 * z0);
  SNAcomplex a = { r0inv * z0, -r0inv * z };
  SNAcomplex b = { r0inv * y, -r0inv * x };
  // VMK Section 4.8.2
  // All writes go to global memory and shared memory
  // so we can avoid all global memory reads
  Kokkos::single(Kokkos::PerThread(team), [=]() {
    ulist(0,iatom,jnbor) = { 1.0, 0.0 };
    buf1[max_m_tile*team_rank] = {1.,0.};
  });
  for (int j = 1; j <= twojmax; j++) {
    const int jju = idxu_block[j];
    int jjup = idxu_block[j-1];
    // fill in left side of matrix layer from previous layer
    // Flatten loop over ma, mb, need to figure out total
    // number of iterations
    // for (int ma = 0; ma <= j; ma++)
    const int n_ma = j+1;
    // for (int mb = 0; 2*mb <= j; mb++)
    const int n_mb = j/2+1;
    const int total_iters = n_ma * n_mb;
    //for (int m = 0; m < total_iters; m++) {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
      [&] (const int m) {
      // ma fast, mb slow
      int ma = m % n_ma;
      int mb = m / n_ma;
      // index into global memory array
      const int jju_index = jju+mb+mb*j+ma;
      // index into shared memory buffer for previous level
      const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
      // index into shared memory buffer for next level
      const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
      SNAcomplex u_accum = {0., 0.};
      // VMK recursion relation: grab contribution which is multiplied by a*
      const double rootpq1 = rootpqarray(j - ma, j - mb);
      const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
      caconjxpy(a, u_up1, u_accum);
      // VMK recursion relation: grab contribution which is multiplied by b*
      const double rootpq2 = -rootpqarray(ma, j - mb);
      const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
      caconjxpy(b, u_up2, u_accum);
      ulist(jju_index,iatom,jnbor) = u_accum;
      // We no longer accumulate into ulisttot in this kernel.
      // Instead, we have a separate kernel which avoids atomics.
      // Running two separate kernels is net faster.
      // back up into shared memory for next iter
      if (j != twojmax) buf2[jju_shared_idx] = u_accum;
      // copy left side to right side with inversion symmetry VMK 4.4(2)
      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
      // We can avoid this if we're on the last row for an integer j
      if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) {
        int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
        const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
        const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
        if (sign_factor == 1) {
          u_accum.im = -u_accum.im;
        } else {
          u_accum.re = -u_accum.re;
        }
        ulist(jjup_flip,iatom,jnbor) = u_accum;
        if (j != twojmax) buf2[jju_shared_flip] = u_accum;
      }
    });
    // In CUDA backend,
    // ThreadVectorRange has a __syncwarp (appropriately masked for
    // vector lengths < 32) implicit at the end
    // swap double buffers
    auto tmp = buf1; buf1 = buf2; buf2 = tmp;
    //std::swap(buf1, buf2); // throws warnings
  }
 }
 // CPU version
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@ -976,152 +1073,9 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
 /* ----------------------------------------------------------------------
   compute derivatives of Wigner U-functions for one neighbor
-   see comments in compute_uarray()
+   see comments in compute_uarray_cpu()
 ------------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
                          double x, double y, double z,
                          double z0, double r, double dz0dr,
                          double wj, double rcut)
 {
  // get shared memory offset
  const int max_m_tile = (twojmax+1)*(twojmax+1);
  const int team_rank = team.team_rank();
  // double buffer for ulist
  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  // double buffer for dulist
  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
  const double sfac = wj * compute_sfac(r, rcut);
  const double dsfac = wj * compute_dsfac(r, rcut);
  const double rinv = 1.0 / r;
  // extract a single unit vector
  const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
  // Compute Cayley-Klein parameters for unit quaternion
  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
  const SNAcomplex a = { r0inv * z0, -r0inv * z };
  const SNAcomplex b = { r0inv * y, -r0inv * x };
  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
  const double dr0inv = dr0invdr * u;
  const double dz0 = dz0dr * u;
  const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
                              - z * dr0inv + (dir == 2 ? - r0inv : 0.) };
  const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
                              -x * dr0inv + (dir==0?-r0inv:0.) };
  // single has a warp barrier at the end
  Kokkos::single(Kokkos::PerThread(team), [=]() {
    dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
    ulist_buf1[max_m_tile*team_rank] = {1., 0.};
    dulist_buf1[max_m_tile*team_rank] = {0., 0.};
  });
  for (int j = 1; j <= twojmax; j++) {
    int jju = idxu_block[j];
    int jjup = idxu_block[j-1];
    // flatten the loop over ma,mb
    // for (int ma = 0; ma <= j; ma++)
    const int n_ma = j+1;
    // for (int mb = 0; 2*mb <= j; mb++)
    const int n_mb = j/2+1;
    const int total_iters = n_ma * n_mb;
    //for (int m = 0; m < total_iters; m++) {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
      [&] (const int m) {
      // ma fast, mb slow
      int ma = m % n_ma;
      int mb = m / n_ma;
      const int jju_index = jju+mb+mb*j+ma;
      // index into shared memory
      const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
      const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
      // Need to compute and accumulate both u and du (mayhaps, we could probably
      // balance some read and compute by reading u each time).
      SNAcomplex u_accum = { 0., 0. };
      SNAcomplex du_accum = { 0., 0. };
      const double rootpq1 = rootpqarray(j - ma, j - mb);
      const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
      caconjxpy(a, u_up1, u_accum);
      const double rootpq2 = -rootpqarray(ma, j - mb);
      const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
      caconjxpy(b, u_up2, u_accum);
      // No need to save u_accum to global memory
      if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum;
      // Next, spin up du_accum
      const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
      caconjxpy(da, u_up1, du_accum);
      caconjxpy(a, du_up1, du_accum);
      const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
      caconjxpy(db, u_up2, du_accum);
      caconjxpy(b, du_up2, du_accum);
      dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
      if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum;
      // copy left side to right side with inversion symmetry VMK 4.4(2)
      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
      int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
      const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
      const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
      if (sign_factor == 1) {
        //ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re;
        //ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im;
        u_accum.im = -u_accum.im;
        du_accum.im = -du_accum.im;
      } else {
        //ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re;
        //ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im;
        u_accum.re = -u_accum.re;
        du_accum.re = -du_accum.re;
      }
      dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
      if (j != twojmax) {
        ulist_buf2[jju_shared_flip] = u_accum;
        dulist_buf2[jju_shared_flip] = du_accum;
      }
    });
    // swap buffers
    auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
    tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
  }
 }
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@ -1680,11 +1634,17 @@ double SNAKokkos<DeviceType>::memory_usage()
  bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
  bytes += idxcg_max * sizeof(double);                   // cglist
 #ifdef KOKKOS_ENABLE_CUDA
  if (!std::is_same<DeviceType,Kokkos::Cuda>::value) {
 #endif
    bytes += natom * idxu_max * sizeof(double) * 2;        // ulist
    bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
 #ifdef KOKKOS_ENABLE_CUDA
  }
 #endif
  bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot
  if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
    bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot_lr
  bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
  bytes += natom * idxz_max * sizeof(double) * 2;        // zlist
  bytes += natom * idxb_max * sizeof(double);            // blist
--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
@ -2920,7 +2920,7 @@ void MSM::compute_phis_and_dphis(const double &dx, const double &dy,
 /* ----------------------------------------------------------------------
   compute phi using interpolating polynomial
-   see Eq 7 from Parallel Computing 35 (2009) 164–177
+   see Eq 7 from Parallel Computing 35 (2009) 164-177
   and Hardy's thesis
 ------------------------------------------------------------------------- */
@ -2999,7 +2999,7 @@ inline double MSM::compute_phi(const double &xi)
 /* ----------------------------------------------------------------------
   compute the derivative of phi
   phi is an interpolating polynomial
-   see Eq 7 from Parallel Computing 35 (2009) 164–177
+   see Eq 7 from Parallel Computing 35 (2009) 164-177
   and Hardy's thesis
 ------------------------------------------------------------------------- */
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
-   Contributing author: Markus Höhnerbach (RWTH)
+   Contributing author: Markus Höhnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #include <cmath>
--- a/src/USER-SMTBQ/pair_smtbq.cpp
+++ b/src/USER-SMTBQ/pair_smtbq.cpp
@ -13,7 +13,7 @@
 /* ----------------------------------------------------------------------
   The SMTBQ code has been developed with the financial support of  CNRS and
-   of the Regional Council of Burgundy (Convention n¡ 2010-9201AAO037S03129)
+   of the Regional Council of Burgundy (Convention n¡ 2010-9201AAO037S03129)
   Copyright (2015)
   Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO
@ -943,7 +943,7 @@ void PairSMTBQ::compute(int eflag, int vflag)
     3 -> Short int. Ox-Ox
     4 -> Short int. SMTB (repulsion)
     5 -> Covalent energy SMTB
-     6 -> Somme des Q(i)²
+     6 -> Somme des Q(i)²
     ------------------------------------------------------------------------- */
  /* -------------- N-body forces Calcul --------------- */
@ -3022,7 +3022,7 @@ void PairSMTBQ::groupQEqAllParallel_QEq()
  ngp = igp = 0; nelt[ngp] = 0;
-  // On prend un oxygène
+  // On prend un oxygène
  //   printf ("[me %d] On prend un oxygene\n",me);
  for (ii = 0; ii < inum; ii++) {