Added support for symmetrized memory layouts for ui/duidrj for the CPU- and GPU-path SNAP Kokkos implementation, various perf optimizations for ComputeUi/ComputeFusedDeidrj

2020-08-12 16:15:06 -04:00 · 2020-08-12 16:15:06 -04:00 · a5d27763e5
parent ac43f8f685
commit a5d27763e5
5 changed files with 530 additions and 298 deletions
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -1058,7 +1058,7 @@ struct alignas(2*sizeof(real)) SNAComplex
 {
  real re,im;

-  KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
+  SNAComplex() = default;

  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
   : re(re), im(static_cast<real>(0.)) { ; }
@ -1100,6 +1100,17 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAC

 typedef SNAComplex<SNAreal> SNAcomplex;

+// Cayley-Klein pack
+// Can guarantee it's aligned to 2 complex
+struct alignas(32) CayleyKleinPack {
+
+  SNAcomplex a, b;
+  SNAcomplex da[3], db[3];
+  SNAreal sfac;
+  SNAreal dsfacu[3];
+
+};
+

 #if defined(KOKKOS_ENABLE_CXX11)
 #undef ISFINITE
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -50,6 +50,7 @@ struct TagPairSNAPComputeFusedDeidrj{};
 // CPU backend only
 struct TagPairSNAPPreUiCPU{};
 struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPTransformUiCPU{};
 struct TagPairSNAPComputeZiCPU{};
 struct TagPairSNAPBetaCPU{};
 struct TagPairSNAPComputeBiCPU{};
@ -104,7 +105,7 @@ public:
  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
+  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
@ -134,15 +135,15 @@ public:
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;

+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
+
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;

-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
-
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;

--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

  EV_FLOAT ev;

-  int idxu_max = snaKK.idxu_max;
-
  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory

    EV_FLOAT ev_tmp;
@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
      }

+      {
+        // Expand ulisttot -> ulisttot_full
+        // Zero out ylist
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
+        Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
+      }
+
      //Compute bispectrum
      if (quadraticflag || eflag) {
        //ComputeZi
@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
      }

-      //ZeroYi,ComputeYi
+      //ComputeYi
      {
-        int vector_length = vector_length_default;
-        int team_size = team_size_default;
-
        //Compute beta = dE_i/dB_i for all i in list
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);

-        //ZeroYi
-        check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
-        Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
-
        //ComputeYi
        int idxz_max = snaKK.idxz_max;
        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
      }
+
    } else { // GPU

 #ifdef LMP_KOKKOS_GPU
@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        int team_size = 4; // need to cap b/c of shared memory reqs
        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);

-        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
+        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
        //   2 is for double buffer

-        const int tile_size = (twojmax+1)*(twojmax+1);
+        const int tile_size = (twojmax+1)*(twojmax/2+1);
        typedef Kokkos::View< SNAcomplex*,
                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
                              Kokkos::MemoryTraits<Kokkos::Unmanaged> >
@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeUi",policy_ui,*this);

        //Transform data layout of ulisttot to AoSoA, zero ylist
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);

      }
@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);

        //Transform data layout of ylist out of AoSoA
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        const int idxu_half_max = snaKK.idxu_half_max;
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);

      }
@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
        }
      }

-#endif // KOKKOS_ENABLE_CUDA
+#endif // LMP_KOKKOS_GPU

    }

@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen

    if ( rsq < rnd_cutsq(itype,jtype) ) {
      if (final) {
-        my_sna.rij(ii,offset,0) = dx;
-        my_sna.rij(ii,offset,1) = dy;
-        my_sna.rij(ii,offset,2) = dz;
+#ifdef LMP_KOKKOS_GPU
+        if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
+          my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
+                                      d_wjelem[elem_j]);
+        } else {
+#endif
+          my_sna.rij(ii,offset,0) = dx;
+          my_sna.rij(ii,offset,1) = dy;
+          my_sna.rij(ii,offset,2) = dz;
+          my_sna.wj(ii,offset) = d_wjelem[elem_j];
+          my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
+#ifdef LMP_KOKKOS_GPU
+        }
+#endif
        my_sna.inside(ii,offset) = j;
-        my_sna.wj(ii,offset) = d_wjelem[elem_j];
-        my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
        if (chemflag)
          my_sna.element(ii,offset) = elem_j;
        else
@ -704,27 +712,56 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename

 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;

-  if (idxu >= my_sna.idxu_max) return;
+  if (j > twojmax) return; 

  int elem_count = chemflag ? nelements : 1;

  for (int ielem = 0; ielem < elem_count; ielem++) {
+    const int jju_half = my_sna.idxu_half_block(j);
+    const int jju = my_sna.idxu_block(j);

-    const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
-    const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+        // Extract top half

-    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+        const int idxu_shift = mb * (j + 1) + ma;
+        const int idxu_half = jju_half + idxu_shift;
+        const int idxu = jju + idxu_shift;

-    my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
-    my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
+        auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
+        auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);
+
+        // Store
+        my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+          
+        // Also zero yi
+        my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
+        my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
+
+        // Symmetric term
+        const int sign_factor = (((ma+mb)%2==0)?1:-1);
+        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
+
+        if (sign_factor == 1) {
+          utot_im = -utot_im;
+        } else {
+          utot_re = -utot_re;
+        }
+
+        my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };
+
+        // No need to zero symmetrized ylist
+        //my_sna.ylist_pack_re(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
+        //my_sna.ylist_pack_im(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
+      }
+    }
  }
-
 }

 template<class DeviceType>
@ -742,20 +779,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato

 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  const int iatom = iatom_mod + iatom_div * 32;
  if (iatom >= chunk_size) return;

-  if (idxu >= my_sna.idxu_max) return;
+  if (idxu_half >= my_sna.idxu_half_max) return;

  int elem_count = chemflag ? nelements : 1;
  for (int ielem = 0; ielem < elem_count; ielem++) {
-    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
-    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
+    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
+    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);

-    my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
+    my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
  }

 }
@ -904,22 +941,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen

 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

-  // Extract the quantum number
-  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
-  if (idx >= my_sna.idxu_max) return;
+  if (iatom >= chunk_size) return;

-  // Extract the atomic index
-  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
-  if (ii >= chunk_size) return;
+  if (j > twojmax) return; 

-  if (chemflag)
-    for(int ielem = 0; ielem < nelements; ielem++)
-      my_sna.zero_yi_cpu(idx,ii,ielem);
-  else
-    my_sna.zero_yi_cpu(idx,ii,0);
+  int elem_count = chemflag ? nelements : 1;
+
+  // De-symmetrize ulisttot
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+
+    const int jju_half = my_sna.idxu_half_block(j);
+    const int jju = my_sna.idxu_block(j);
+
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+        // Extract top half
+
+        const int idxu_shift = mb * (j + 1) + ma;
+        const int idxu_half = jju_half + idxu_shift;
+        const int idxu = jju + idxu_shift;
+
+        // Load ulist
+        auto utot = my_sna.ulisttot(idxu_half, ielem, iatom);
+
+        // Store
+        my_sna.ulisttot_full(idxu, ielem, iatom) = utot;
+
+        // Zero Yi
+        my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.};
+
+        // Symmetric term
+        const int sign_factor = (((ma+mb)%2==0)?1:-1);
+        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
+
+        if (sign_factor == 1) {
+          utot.im = -utot.im;
+        } else {
+          utot.re = -utot.re;
+        }
+
+        my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot;
+      }
+    }
+  }
 }

 template<class DeviceType>
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -55,6 +55,8 @@ public:
  typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
  typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;

+  typedef Kokkos::View<CayleyKleinPack**, DeviceType> t_sna_2ckp; 
+
 inline
  SNAKokkos() {};
  KOKKOS_INLINE_FUNCTION
@ -78,6 +80,9 @@ inline

  // functions for bispectrum coefficients, GPU only
  KOKKOS_INLINE_FUNCTION
+  void compute_cayley_klein(const int&, const int&, const double&, const double&,
+                            const double&, const double&, const double&);
+  KOKKOS_INLINE_FUNCTION
  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
@ -97,8 +102,6 @@ inline
  KOKKOS_INLINE_FUNCTION
  void compute_zi_cpu(const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
-  void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
  void compute_yi_cpu(int,
   const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
    KOKKOS_INLINE_FUNCTION
@ -117,6 +120,8 @@ inline
  double compute_sfac(double, double); // add_uarraytot, compute_duarray
  KOKKOS_INLINE_FUNCTION
  double compute_dsfac(double, double); // compute_duarray
+  KOKKOS_INLINE_FUNCTION
+  void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein

  // efficient complex FMA
  // efficient caxpy (i.e., y += a x)
@ -140,6 +145,9 @@ inline

  //per sna class instance for OMP use

+  // Alternative to rij, wj, rcutij...
+  // just calculate everything up front
+  t_sna_2ckp cayleyklein;

  // Per InFlight Particle
  t_sna_3d rij;
@ -156,6 +164,7 @@ inline

  t_sna_3d_ll blist;
  t_sna_3c_ll ulisttot;
+  t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
  t_sna_3c_ll zlist;

  t_sna_3c_ll ulist;
@ -173,7 +182,7 @@ inline
  t_sna_4d_ll ylist_pack_re; // split real,
  t_sna_4d_ll ylist_pack_im; // imag AoSoA layout

-  int idxcg_max, idxu_max, idxz_max, idxb_max;
+  int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;

  // Chem snap counts
  int nelements;
@ -188,7 +197,13 @@ private:
  Kokkos::View<int*[10], DeviceType> idxz;
  Kokkos::View<int*[3], DeviceType> idxb;
  Kokkos::View<int***, DeviceType> idxcg_block;
+
+public:
  Kokkos::View<int*, DeviceType> idxu_block;
+  Kokkos::View<int*, DeviceType> idxu_half_block;
+  Kokkos::View<int*, DeviceType> idxu_cache_block;
+
+private:
  Kokkos::View<int***, DeviceType> idxz_block;
  Kokkos::View<int***, DeviceType> idxb_block;

--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -120,6 +120,36 @@ void SNAKokkos<DeviceType>::build_indexlist()
  idxu_max = idxu_count;
  Kokkos::deep_copy(idxu_block,h_idxu_block);

+  // index list for half uarray 
+  idxu_half_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_half_block",jdim);
+  auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
+
+  int idxu_half_count = 0;
+  for(int j = 0; j <= twojmax; j++) {
+    h_idxu_half_block[j] = idxu_half_count;
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++)
+        idxu_half_count++;
+  }
+  idxu_half_max = idxu_half_count;
+  Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
+
+  // index list for "cache" uarray
+  // this is the GPU scratch memory requirements
+  // applied the CPU structures
+  idxu_cache_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_cache_block",jdim);
+  auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
+
+  int idxu_cache_count = 0;
+  for(int j = 0; j <= twojmax; j++) {
+    h_idxu_cache_block[j] = idxu_cache_count;
+    for(int mb = 0; mb < ((j+3)/2); mb++)
+      for (int ma = 0; ma <= j; ma++)
+        idxu_cache_count++;
+  }
+  idxu_cache_max = idxu_cache_count;
+  Kokkos::deep_copy(idxu_cache_block, h_idxu_cache_block);
+
  // index list for beta and B

  int idxb_count = 0;
@ -201,15 +231,21 @@ void SNAKokkos<DeviceType>::build_indexlist()
            h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;

            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+            // ylist is "compressed" via symmetry in its
+            // contraction with dulist
+            const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma;
+            h_idxz(idxz_count,9) = jju_half;

-            const int jju = h_idxu_block[j] + (j+1)*mb + ma;
-            h_idxz(idxz_count,9) = jju;
+            // no longer needed
+            //const int jju = h_idxu_block[j] + (j+1)*mb + ma;
+            //h_idxz(idxz_count,9) = jju;

            idxz_count++;
          }
      }
  Kokkos::deep_copy(idxz,h_idxz);
  Kokkos::deep_copy(idxz_block,h_idxz_block);
+
 }

 /* ---------------------------------------------------------------------- */
@ -230,44 +266,47 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
  natom = newnatom;
  nmax = newnmax;

-  rij = t_sna_3d("sna:rij",natom,nmax,3);
-  inside = t_sna_2i("sna:inside",natom,nmax);
-  wj = t_sna_2d("sna:wj",natom,nmax);
-  rcutij = t_sna_2d("sna:rcutij",natom,nmax);
-  element = t_sna_2i("sna:rcutij",natom,nmax);
-  dedr = t_sna_3d("sna:dedr",natom,nmax,3);
+  inside = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:inside"),natom,nmax);
+  element = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
+  dedr = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:dedr"),natom,nmax,3);

 #ifdef LMP_KOKKOS_GPU
  if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
-    // dummy allocation
-    ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1);
-    ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom);
-    ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom);
-    ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32);
-    ulist = t_sna_3c_ll("sna:ulist",1,1,1);
-    zlist = t_sna_3c_ll("sna:zlist",1,1,1);
-    zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32);
-    blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
-    blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32);
-    ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
-    ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32);
-    ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32);
-    dulist = t_sna_4c3_ll("sna:dulist",1,1,1);
+
+    cayleyklein = t_sna_2ckp(Kokkos::ViewAllocateWithoutInitializing("sna:cayleyklein"), natom, nmax);
+    ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1); // dummy allocation
+    ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1);
+    ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),idxu_half_max,nelements,natom);
+    ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),idxu_half_max,nelements,natom);
+    ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),32,idxu_max,nelements,(natom+32-1)/32);
+    ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),1,1,1);
+    zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),1,1,1);
+    zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),32,idxz_max,ndoubles,(natom+32-1)/32);
+    blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
+    blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),32,idxb_max,ntriples,(natom+32-1)/32);
+    ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
+    ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),32,idxu_half_max,nelements,(natom+32-1)/32);
+    ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),32,idxu_half_max,nelements,(natom+32-1)/32);
+    dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),1,1,1);
  } else {
 #endif
-    ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom);
-    ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1);
-    ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1);
-    ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1);
-    ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
-    zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom);
-    zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1);
-    blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
-    blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1);
-    ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
-    ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1);
-    ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1);
-    dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax);
+    rij = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:rij"),natom,nmax,3);
+    wj = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:wj"),natom,nmax);
+    rcutij = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
+    ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),idxu_half_max,nelements,natom);
+    ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_full"),idxu_max,nelements,natom);
+    ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),1,1,1);
+    ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),1,1,1);
+    ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),1,1,1,1);
+    ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),idxu_cache_max,natom,nmax);
+    zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),idxz_max,ndoubles,natom);
+    zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),1,1,1,1);
+    blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
+    blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),1,1,1,1);
+    ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
+    ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),1,1,1,1);
+    ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),1,1,1,1);
+    dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),idxu_cache_max,natom,nmax);

 #ifdef LMP_KOKKOS_GPU
  }
@ -278,20 +317,111 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
 * GPU routines
 * ----------------------------------------------------------------------*/

+
+/* ----------------------------------------------------------------------
+   Precompute the Cayley-Klein parameters and the derivatives thereof.
+   This routine better exploits parallelism than the GPU ComputeUi and 
+   ComputeFusedDeidrj, which are one warp per atom-neighbor pair.
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_cayley_klein(const int& iatom, const int& jnbor, const double& x, const double& y,
+                                         const double& z, const double& rcut, const double& wj_local)
+{
+  //const double x = rij(iatom,jnbor,0);
+  //const double y = rij(iatom,jnbor,1);
+  //const double z = rij(iatom,jnbor,2);
+  const double rsq = x * x + y * y + z * z;
+  const double r = sqrt(rsq);
+  //const double rcut = rcutij(iatom, jnbor);
+  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
+  const double theta0 = (r - rmin0) * rscale0;
+  double sn, cs;
+  sincos(theta0, &sn, &cs);
+  const double z0 = r * cs / sn;
+  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+
+  //const double wj_local = wj(iatom, jnbor);
+  double sfac, dsfac; 
+  compute_s_dsfac(r, rcut, sfac, dsfac);
+  sfac *= wj_local;
+  dsfac *= wj_local;
+
+  const double rinv = 1.0 / r;
+  const double ux = x * rinv;
+  const double uy = y * rinv;
+  const double uz = z * rinv;
+
+  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
+
+  const SNAcomplex a = { z0 * r0inv, -z * r0inv };
+  const SNAcomplex b = { r0inv * y, -r0inv * x };
+
+  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+
+  const double dr0invx = dr0invdr * ux;
+  const double dr0invy = dr0invdr * uy;
+  const double dr0invz = dr0invdr * uz;
+
+  const double dz0x = dz0dr * ux;
+  const double dz0y = dz0dr * uy;
+  const double dz0z = dz0dr * uz;
+
+  const SNAcomplex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
+  const SNAcomplex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
+  const SNAcomplex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
+
+  const SNAcomplex dbx = { y * dr0invx, -x * dr0invx - r0inv };
+  const SNAcomplex dby = { y * dr0invy + r0inv, -x * dr0invy };
+  const SNAcomplex dbz = { y * dr0invz, -x * dr0invz };
+
+  const double dsfacux = dsfac * ux;
+  const double dsfacuy = dsfac * uy;
+  const double dsfacuz = dsfac * uz;
+
+  CayleyKleinPack ckp{};
+  ckp.a = a;
+  ckp.b = b;
+  ckp.da[0] = dax;
+  ckp.db[0] = dbx;
+  ckp.da[1] = day;
+  ckp.db[1] = dby;
+  ckp.da[2] = daz;
+  ckp.db[2] = dbz;
+  ckp.sfac = sfac;
+  ckp.dsfacu[0] = dsfacux;
+  ckp.dsfacu[1] = dsfacuy;
+  ckp.dsfacu[2] = dsfacuz;
+
+  // Yes, this breaks the standard mantra of using SoA
+  // instead of AoS, but it's net fine because of the
+  // one warp per atom/neighbor pair for the recursive
+  // polynomials. There's good L1 reuse, anyway.
+  cayleyklein(iatom, jnbor) = ckp;
+}
+
+/* ----------------------------------------------------------------------
+   Initialize ulisttot with self-energy terms.
+   Ulisttot uses a "half" data layout which takes
+   advantage of the symmetry of the Wigner U matrices.
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
 {
  for (int jelem = 0; jelem < nelements; jelem++) {
    for (int j = 0; j <= twojmax; j++) {
-      const int jju = idxu_block(j);
+      const int jju_half = idxu_half_block(j);

      // Only diagonal elements get initialized
-      // for (int m = 0; m < (j+1)*(j+1); m++)
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
+      // Top half only: gets symmetrized by TransformUi
+      // for (int m = 0; m < (j+1)*(j/2+1); m++)
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
        [&] (const int m) {

-        const int jjup = jju + m;
+        const int jjup = jju_half + m;

        // if m is on the "diagonal", initialize it with the self energy.
        // Otherwise zero it out
@ -323,40 +453,27 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb

  // get shared memory offset
-  const int max_m_tile = (twojmax+1)*(twojmax+1);
+  const int max_m_tile = (twojmax+1)*(twojmax/2+1);
  const int team_rank = team.team_rank();
  const int scratch_shift = team_rank * max_m_tile;

-  // double buffer
+  // shared memory double buffer
+  // Each level stores a (j+1) x ((j+3)/2) tile in preparation
+  // for the computation on the next level. This is the "cached"
+  // data layout, which also gets used on the CPU.
  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;

-  const double x = rij(iatom,jnbor,0);
-  const double y = rij(iatom,jnbor,1);
-  const double z = rij(iatom,jnbor,2);
+  // Each warp is accessing the same pack: take advantage of
+  // L1 reuse
+  const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
+
+  const SNAcomplex a = ckp.a;
+  const SNAcomplex b = ckp.b;
+  const double sfac = ckp.sfac;

-  const double wj_local = wj(iatom, jnbor);
-  const double rcut = rcutij(iatom, jnbor);
  const int jelem = element(iatom, jnbor);

-  const double rsq = x * x + y * y + z * z;
-  const double r = sqrt(rsq);
-
-  const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
-  //    theta0 = (r - rmin0) * rscale0;
-  const double cs = cos(theta0);
-  const double sn = sin(theta0);
-  const double z0 = r * cs / sn; // r / tan(theta0)
-
-  // Compute cutoff function
-  const double sfac = compute_sfac(r, rcut) * wj_local;
-
-  // compute Cayley-Klein parameters for unit quaternion,
-  // pack into complex number
-  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  const SNAcomplex a = { r0inv * z0, -r0inv * z };
-  const SNAcomplex b = { r0inv * y, -r0inv * x };
-
  // VMK Section 4.8.2

  // All writes go to global memory and shared memory
@ -367,7 +484,8 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
  });

  for (int j = 1; j <= twojmax; j++) {
-    const int jju = idxu_block[j];
+
+    const int jju = idxu_half_block[j];

    // fill in left side of matrix layer from previous layer

@ -377,8 +495,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
    // for (int mb = 0; 2*mb <= j; mb++)
    const int n_mb = j/2+1;

-    // the last (j / 2) can be avoided due to symmetry
-    const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
+    const int total_iters = n_ma * n_mb;

    //for (int m = 0; m < total_iters; m++) {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
@ -423,25 +540,22 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
      // copy left side to right side with inversion symmetry VMK 4.4(2)
      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
      // if j is even (-> physical j integer), last element maps to self, skip
-      //if (!(m == total_iters - 1 && j % 2 == 0)) {
-      if (m < total_iters - 1 || j % 2 == 1) {
-        const int sign_factor = (((ma+mb)%2==0)?1:-1);
-        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
-        const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);

+      const int sign_factor = (((ma+mb)%2==0)?1:-1);
+      const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);

-        if (sign_factor == 1) {
-          u_accum.im = -u_accum.im;
-        } else {
-          u_accum.re = -u_accum.re;
-        }
-
-        buf2[jju_shared_flip] = u_accum;
-
-        // split re, im to get fully coalesced atomic add
-        Kokkos::atomic_add(&(ulisttot_re(jjup_flip,jelem,iatom)), sfac * u_accum.re);
-        Kokkos::atomic_add(&(ulisttot_im(jjup_flip,jelem,iatom)), sfac * u_accum.im);
+      if (sign_factor == 1) {
+        u_accum.im = -u_accum.im;
+      } else {
+        u_accum.re = -u_accum.re;
      }
+
+      if (j%2==1 && mb+1==n_mb) {
+        buf2[jju_shared_flip] = u_accum;
+      }
+
+      // symmetric part of ulisttot is generated in TransformUi
+
    });
    // In CUDA backend,
    // ThreadVectorRange has a __syncwarp (appropriately masked for
@ -454,8 +568,11 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
  }
 }

+
 /* ----------------------------------------------------------------------
-   compute Zi by summing over products of Ui, GPU version
+   compute Zi by summing over products of Ui,
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -525,6 +642,8 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con

 /* ----------------------------------------------------------------------
   compute Bi by summing conj(Ui)*Zi
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. 
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -615,8 +734,9 @@ void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, con


 /* ----------------------------------------------------------------------
-   compute Yi from Ui without storing Zi, looping over zlist indices,
-   GPU version
+   compute Yi from Ui without storing Zi, looping over zlist indices.
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version.
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -635,7 +755,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
  const int mb2max = idxz(jjz, 6);
  const int na = idxz(jjz, 7);
  const int nb = idxz(jjz, 8);
-  const int jju = idxz(jjz, 9);
+  const int jju_half = idxz(jjz, 9);

  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
@ -677,8 +797,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
      } // end loop over ib

      if (bnorm_flag) {
-        ztmp_r /= j + 1;
-        ztmp_i /= j + 1;
+        ztmp_r /= (j + 1);
+        ztmp_i /= (j + 1);
      }

      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
@ -710,8 +830,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
          betaj *= (j1 + 1) / (j + 1.0);


-        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r);
-        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i);
+        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_r);
+        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_i);
      } // end loop over elem3
    } // end loop over elem2
  } // end loop over elem1
@ -719,7 +839,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,

 /* ----------------------------------------------------------------------
   Fused calculation of the derivative of Ui w.r.t. atom j
-   and of dEidRj. GPU only.
+   and accumulation into dEidRj. GPU only.
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -732,6 +852,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
  const int scratch_shift = team_rank * max_m_tile;
  const int jelem = element(iatom, jnbor);

+  // See notes on data layouts for shared memory caching
+  // in `compute_ui`.
+
  // double buffer for ulist
  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
@ -740,50 +863,20 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;

-  const double x = rij(iatom,jnbor,0);
-  const double y = rij(iatom,jnbor,1);
-  const double z = rij(iatom,jnbor,2);
-  const double rsq = x * x + y * y + z * z;
-  const double r = sqrt(rsq);
-  const double rcut = rcutij(iatom, jnbor);
-  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
-  const double theta0 = (r - rmin0) * rscale0;
-  const double cs = cos(theta0);
-  const double sn = sin(theta0);
-  const double z0 = r * cs / sn;
-  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+  const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);

-  const double wj_local = wj(iatom, jnbor);
-  const double sfac = wj_local * compute_sfac(r, rcut);
-  const double dsfac = wj_local * compute_dsfac(r, rcut);
-
-  const double rinv = 1.0 / r;
-
-  // extract a single unit vector
-  const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
-
-  // Compute Cayley-Klein parameters for unit quaternion
-  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
-
-  const SNAcomplex a = { r0inv * z0, -r0inv * z };
-  const SNAcomplex b = { r0inv * y, -r0inv * x };
-
-  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
-  const double dr0inv = dr0invdr * u;
-  const double dz0 = dz0dr * u;
-
-  const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
-                              - z * dr0inv + (dir == 2 ? - r0inv : 0.) };
-
-  const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
-                              -x * dr0inv + (dir==0?-r0inv:0.) };
+  const SNAcomplex a = ckp.a;
+  const SNAcomplex b = ckp.b;
+  const SNAcomplex da = ckp.da[dir];
+  const SNAcomplex db = ckp.db[dir];
+  const double sfac = ckp.sfac;
+  const double dsfacu = ckp.dsfacu[dir]; // dsfac * u

  // Accumulate the full contribution to dedr on the fly
-  const double du_prod = dsfac * u; // chain rule
  const SNAcomplex y_local = ylist(0, jelem, iatom);

  // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
-  double dedr_full_sum = 0.5 * du_prod * y_local.re;
+  double dedr_full_sum = 0.5 * dsfacu * y_local.re;

  // single has a warp barrier at the end
  Kokkos::single(Kokkos::PerThread(team), [=]() {
@ -793,8 +886,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
  });

  for (int j = 1; j <= twojmax; j++) {
-    int jju = idxu_block[j];
-    int jjup = idxu_block[j-1];
+    int jju_half = idxu_half_block[j];

    // flatten the loop over ma,mb

@ -812,11 +904,10 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
      [&] (const int m, double& sum_tmp) {

      // ma fast, mb slow
-      // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
      const int mb = m / n_ma;
      const int ma = m - mb * n_ma;

-      const int jju_index = jju+m;
+      const int jju_half_index = jju_half+m;

      // Load y_local, apply the symmetry scaling factor
      // The "secret" of the shared memory optimization is it eliminates
@ -824,7 +915,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
      // shared memory and otherwise always writing, making the kernel
      // ultimately compute bound. We take advantage of that by adding
      // some reads back in.
-      auto y_local = ylist(jju_index, jelem, iatom);
+      auto y_local = ylist(jju_half_index, jelem, iatom);
      if (j % 2 == 0 && 2*mb == j) {
        if (ma == mb) { y_local = 0.5*y_local; }
        else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
@ -870,7 +961,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
      dulist_buf2[jju_shared_idx] = du_accum;

      // Directly accumulate deidrj into sum_tmp
-      const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
+      const SNAcomplex du_prod = (dsfacu * u_accum) + (sfac * du_accum);
      sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;

      // copy left side to right side with inversion symmetry VMK 4.4(2)
@ -923,8 +1014,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
 * ----------------------------------------------------------------------*/

 /* ----------------------------------------------------------------------
- *    compute Ui by summing over neighbors j
- *    ------------------------------------------------------------------------- */
+   Ulisttot uses a "half" data layout which takes
+   advantage of the symmetry of the Wigner U matrices.
+ * ------------------------------------------------------------------------- */

 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
@ -932,11 +1024,11 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
 {
  for (int jelem = 0; jelem < nelements; jelem++) {
    for (int j = 0; j <= twojmax; j++) {
-      const int jju = idxu_block(j);
+      const int jju = idxu_half_block(j);

      // Only diagonal elements get initialized
-      // for (int m = 0; m < (j+1)*(j+1); m++)
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
+      // for (int m = 0; m < (j+1)*(j/2+1); m++)
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
        [&] (const int m) {

        const int jjup = jju + m;
@ -956,6 +1048,8 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT

 /* ----------------------------------------------------------------------
   compute Ui by summing over bispectrum components. CPU only.
+   See comments above compute_uarray_cpu and add_uarraytot for
+   data layout comments.
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -1026,10 +1120,10 @@ void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
        int ma2 = ma2max;
        int icga = ma1min * (j2 + 1) + ma2max;
        for(int ia = 0; ia < na; ia++) {
-          suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
-                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
-          suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
-                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
+          suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
          ma1++;
          ma2--;
          icga += j2;
@ -1098,8 +1192,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
              const int jjz_index = jjz + mb * (j + 1) + ma;
              if (2*mb == j) return;
              sum +=
-                ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
-                ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+                ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
            },sumzu_temp); // end loop over ma, mb
            sumzu += sumzu_temp;

@ -1113,8 +1207,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
              const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
              const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
              sum +=
-                ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
-                ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+                ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
            },sumzu_temp); // end loop over ma
            sumzu += sumzu_temp;

@ -1122,8 +1216,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
            sumzu += 0.5*
-              (ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
-               ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
+              (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+               ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
          } // end if jeven

          Kokkos::single(Kokkos::PerThread(team), [&] () {
@ -1154,17 +1248,6 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev

 }

-/* ----------------------------------------------------------------------
-   compute Yi from Ui without storing Zi, looping over zlist indices
------------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem)
-{
-  ylist(idx,ielem,iatom) = {0.0, 0.0};
-}
-
 /* ----------------------------------------------------------------------
   compute Yi from Ui without storing Zi, looping over zlist indices,
   CPU version
@ -1188,7 +1271,7 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
  const int mb2max = idxz(jjz, 6);
  const int na = idxz(jjz, 7);
  const int nb = idxz(jjz, 8);
-  const int jju = idxz(jjz, 9);
+  const int jju_half = idxz(jjz, 9);

  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
@ -1214,10 +1297,10 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
        int icga = ma1min*(j2+1) + ma2max;

        for (int ia = 0; ia < na; ia++) {
-          suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
-                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
-          suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
-                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
+          suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
+                                      ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
          ma1++;
          ma2--;
          icga += j2;
@ -1264,8 +1347,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
        if (!bnorm_flag && j1 > j)
          betaj *= (j1 + 1) / (j + 1.0);

-        Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r);
-        Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i);
+        Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r);
+        Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i);
      } // end loop over elem3
    } // end loop over elem2
  } // end loop over elem1
@ -1274,6 +1357,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,

 /* ----------------------------------------------------------------------
   calculate derivative of Ui w.r.t. atom j
+   see comments above compute_duarray_cpu for comments on the
+   data layout
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -1301,6 +1386,10 @@ void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy

 /* ----------------------------------------------------------------------
   compute dEidRj, CPU path only.
+   dulist takes advantage of a `cached` data layout, similar to the
+   shared memory layout for the GPU routines, which is efficient for
+   compressing the calculation in compute_duarray_cpu. That said,
+   dulist only uses the "half" data layout part of that structure.
 ------------------------------------------------------------------------- */


@ -1314,14 +1403,18 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
  //for(int j = 0; j <= twojmax; j++) {
  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
      [&] (const int& j, t_scalar3<double>& sum_tmp) {
-    int jju = idxu_block[j];
+    int jju_half = idxu_half_block[j];
+    int jju_cache = idxu_cache_block[j];

    for(int mb = 0; 2*mb < j; mb++)
      for(int ma = 0; ma <= j; ma++) {
-        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
-        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
-        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
-        jju++;
+        sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + 
+                     dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
+        jju_half++; jju_cache++;
      } //end loop over ma mb

    // For j even, handle middle column
@ -1330,16 +1423,22 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy

      int mb = j/2;
      for(int ma = 0; ma < mb; ma++) {
-        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
-        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
-        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
-        jju++;
+        sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                     dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + 
+                     dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
+        sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + 
+                     dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
+        jju_half++; jju_cache++;
      }

      //int ma = mb;
-      sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5;
-      sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5;
-      sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5;
+      sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
+                    dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5;
+      sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + 
+                    dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5;
+      sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + 
+                    dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5;
    } // end if jeven

  },final_sum); // end loop over j
@ -1355,6 +1454,10 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy

 /* ----------------------------------------------------------------------
   add Wigner U-functions for one neighbor to the total
+   ulist is in a "cached" data layout, which is a compressed layout
+   which still keeps the recursive calculation simple. On the other hand
+   `ulisttot` uses a "half" data layout, which fully takes advantage
+   of the symmetry of the Wigner U matrices.
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -1364,15 +1467,27 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
 {
  const double sfac = compute_sfac(r, rcut) * wj;

-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
-      [&] (const int& i) {
-    Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).re), sfac * ulist(i,iatom,jnbor).re);
-    Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).im), sfac * ulist(i,iatom,jnbor).im);
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
+      [&] (const int& j) {
+
+    int jju_half = idxu_half_block[j]; // index into ulisttot
+    int jju_cache = idxu_cache_block[j]; // index into ulist
+
+    int count = 0;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
+        Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re);
+        Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im);
+        count++;
+      }
+    }
  });
 }

 /* ----------------------------------------------------------------------
-   compute Wigner U-functions for one neighbor
+   compute Wigner U-functions for one neighbor.
+   `ulisttot` uses a "cached" data layout, matching the amount of
+   information stored between layers via scratch memory on the GPU path
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -1399,8 +1514,8 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
  ulist(0,iatom,jnbor).im = 0.0;

  for (int j = 1; j <= twojmax; j++) {
-    int jju = idxu_block[j];
-    int jjup = idxu_block[j-1];
+    const int jju = idxu_cache_block[j];
+    const int jjup = idxu_cache_block[j-1];

    // fill in left side of matrix layer from previous layer

@ -1434,37 +1549,37 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
          (b_r * ulist(jjup_index,iatom,jnbor).im -
           b_i * ulist(jjup_index,iatom,jnbor).re);
      }
-    });

-    // copy left side to right side with inversion symmetry VMK 4.4(2)
-    // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))

-    jju = idxu_block[j];
-    jjup = jju+(j+1)*(j+1)-1;
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-//    for (int mb = 0; 2*mb <= j; mb++) {
-      int mbpar = (mb)%2==0?1:-1;
-      int mapar = mbpar;
-      for (int ma = 0; ma <= j; ma++) {
-        const int jju_index = jju+mb*(j+1)+ma;
-        const int jjup_index = jjup-mb*(j+1)-ma;
-        if (mapar == 1) {
-          ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
-          ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
-        } else {
-          ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
-          ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
+      // Only need to add one symmetrized row for convenience
+      // Symmetry gets "unfolded" in accumulating ulisttot
+      if (j%2==1 && mb==(j/2)) {
+        const int mbpar = (mb)%2==0?1:-1;
+        int mapar = mbpar;
+        for (int ma = 0; ma <= j; ma++) {
+          const int jju_index = jju + mb*(j+1) + ma;
+          const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
+          if (mapar == 1) {
+            ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
+            ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
+          } else {
+            ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
+            ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
+          }
+          mapar = -mapar;
        }
-        mapar = -mapar;
      }
    });
+
  }
 }

 /* ----------------------------------------------------------------------
   compute derivatives of Wigner U-functions for one neighbor
   see comments in compute_uarray_cpu()
+   Uses same cached data layout of ulist
 ------------------------------------------------------------------------- */

 template<class DeviceType>
@ -1524,8 +1639,8 @@ double r0inv;
  dulist(0,iatom,jnbor,2).im = 0.0;

  for (int j = 1; j <= twojmax; j++) {
-    int jju = idxu_block[j];
-    int jjup = idxu_block[j-1];
+    int jju = idxu_cache_block[j];
+    int jjup = idxu_cache_block[j-1];
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
        [&] (const int& mb) {
    //for (int mb = 0; 2*mb <= j; mb++) {
@ -1568,33 +1683,32 @@ double r0inv;
                       b_i * dulist(jjup_index,iatom,jnbor,k).re);
        }
      }
-    });

-    // copy left side to right side with inversion symmetry VMK 4.4(2)
-    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+      // Only need to add one symmetrized row for convenience
+      // Symmetry gets "unfolded" during the dedr accumulation

-    jju = idxu_block[j];
-    jjup = jju+(j+1)*(j+1)-1;
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-//    for (int mb = 0; 2*mb <= j; mb++) {
-      int mbpar = (mb)%2==0?1:-1;
-      int mapar = mbpar;
-      for (int ma = 0; ma <= j; ma++) {
-        const int jju_index = jju+mb*(j+1)+ma;
-        const int jjup_index = jjup-mb*(j+1)-ma;
-        if (mapar == 1) {
-          for (int k = 0; k < 3; k++) {
-            dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
-            dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
-          }
-        } else {
-          for (int k = 0; k < 3; k++) {
-            dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
-            dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+      if (j%2==1 && mb==(j/2)) {
+        const int mbpar = (mb)%2==0?1:-1;
+        int mapar = mbpar;
+        for (int ma = 0; ma <= j; ma++) {
+          const int jju_index = jju+mb*(j+1)+ma;
+          const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
+          if (mapar == 1) {
+            for (int k = 0; k < 3; k++) {
+              dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
+              dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
+            }
+          } else {
+            for (int k = 0; k < 3; k++) {
+              dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
+              dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
+            }
          }
+          mapar = -mapar;
        }
-        mapar = -mapar;
      }
    });
  }
@ -1605,8 +1719,11 @@ double r0inv;
  sfac *= wj;
  dsfac *= wj;

+  // Even though we fill out a full "cached" data layout above,
+  // we only need the "half" data for the accumulation into dedr.
+  // Thus we skip updating any unnecessary data.
  for (int j = 0; j <= twojmax; j++) {
-    int jju = idxu_block[j];
+    int jju = idxu_cache_block[j];
    for (int mb = 0; 2*mb <= j; mb++)
      for (int ma = 0; ma <= j; ma++) {
        dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
@ -1980,6 +2097,24 @@ double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
  return 0.0;
 }

+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_s_dsfac(const double r, const double rcut, double& sfac, double& dsfac) {
+  if (switch_flag == 0) { sfac = 0.; dsfac = 0.; }
+  else if (switch_flag == 1) {
+    if (r <= rmin0) { sfac = 1.0; dsfac = 0.0; }
+    else if (r > rcut) { sfac = 0.; dsfac = 0.; }
+    else {
+      const double rcutfac = MY_PI / (rcut - rmin0);
+      double sn, cs;
+      sincos((r - rmin0) * rcutfac, &sn, &cs);
+      sfac = 0.5 * (cs + 1.0);
+      dsfac = -0.5 * sn * rcutfac;
+
+    }
+  } else { sfac = 0.; dsfac = 0.; }
+}
+
 /* ---------------------------------------------------------------------- */

 // efficient complex FMA (i.e., y += a x)
@ -2031,30 +2166,31 @@ double SNAKokkos<DeviceType>::memory_usage()

 #ifdef LMP_KOKKOS_GPU
  if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
-    int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts
+    
+    auto natom_pad = (natom+32-1)/32;

-    bytes += natom * idxu_max * nelements * sizeof(double);          // ulisttot_re
-    bytes += natom * idxu_max * nelements * sizeof(double);          // ulisttot_im
+    bytes += natom * idxu_half_max * nelements * sizeof(double);     // ulisttot_re
+    bytes += natom * idxu_half_max * nelements * sizeof(double);     // ulisttot_im
    bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2;  // ulisttot_pack

    bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2;   // zlist_pack
    bytes += natom_pad * idxb_max * ntriples * sizeof(double);       // blist_pack

-    bytes += natom_pad * idxu_max * nelements * sizeof(double);      // ylist_pack_re
-    bytes += natom_pad * idxu_max * nelements * sizeof(double);      // ylist_pack_im
-    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ylist
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_re
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_im
+    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
  } else {
 #endif

-    bytes += natom * nmax * idxu_max * sizeof(double) * 2;           // ulist
-    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ulisttot
+    bytes += natom * nmax * idxu_cache_max * sizeof(double) * 2;     // ulist
+    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ulisttot

    bytes += natom * idxz_max * ndoubles * sizeof(double) * 2;       // zlist
    bytes += natom * idxb_max * ntriples * sizeof(double);           // blist

-    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ylist
+    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist

-    bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2;       // dulist
+    bytes += natom * nmax * idxu_cache_max * 3 * sizeof(double) * 2; // dulist
 #ifdef LMP_KOKKOS_GPU
  }
 #endif
@ -2063,6 +2199,8 @@ double SNAKokkos<DeviceType>::memory_usage()

  bytes += jdim * jdim * jdim * sizeof(int);             // idxcg_block
  bytes += jdim * sizeof(int);                           // idxu_block
+  bytes += jdim * sizeof(int);                           // idxu_half_block
+  bytes += jdim * sizeof(int);                           // idxu_cache_block
  bytes += jdim * jdim * jdim * sizeof(int);             // idxz_block
  bytes += jdim * jdim * jdim * sizeof(int);             // idxb_block