From 85999fc4a7a8060b974e93b54486e60110ad2baa Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 7 Jun 2019 16:31:31 -0600
Subject: [PATCH] Restore original compute_h in fix_qeq_reax_kokkos

---
 src/KOKKOS/fix_qeq_reax_kokkos.cpp | 104 ++++++++++++++++++++++++-----
 src/KOKKOS/fix_qeq_reax_kokkos.h   |  15 ++++-
 2 files changed, 100 insertions(+), 19 deletions(-)
diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index 4cc6d1f219..1351c2138d 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -227,26 +227,32 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
 
   // compute_H
 
-  Kokkos::deep_copy(d_mfill_offset,0);
+  if (lmp->kokkos->ngpus == 0) { // CPU
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    }
+  } else { // GPU, use teams
+    Kokkos::deep_copy(d_mfill_offset,0);
 
-  int vector_length = 32;
-  int atoms_per_team = 4;
-  int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
+    int vector_length = 32;
+    int atoms_per_team = 4;
+    int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
 
-  Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
-                                        vector_length);
-  if (neighflag == FULL) {
-    FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
-        this, atoms_per_team, vector_length);
-    Kokkos::parallel_for(policy, computeH_functor);
-  } else if (neighflag == HALF) {
-    FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
-        this, atoms_per_team, vector_length);
-    Kokkos::parallel_for(policy, computeH_functor);
-  } else {
-    FixQEqReaxKokkosComputeHFunctor<DeviceType, HALFTHREAD> computeH_functor(
-        this, atoms_per_team, vector_length);
-    Kokkos::parallel_for(policy, computeH_functor);
+    Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
+                                          vector_length);
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    }
   }
 
   // init_matvec
@@ -401,6 +407,68 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
 
 /* ---------------------------------------------------------------------- */
 
+template<class DeviceType>
+template <int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
+{
+  const int i = d_ilist[ii];
+  int j,jj,jtype;
+
+  if (mask[i] & groupbit) {
+
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int itype = type(i);
+    const tagint itag = tag(i);
+    const int jnum = d_numneigh[i];
+    if (final)
+      d_firstnbr[i] = m_fill;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      jtype = type(j);
+
+      const X_FLOAT delx = x(j,0) - xtmp;
+      const X_FLOAT dely = x(j,1) - ytmp;
+      const X_FLOAT delz = x(j,2) - ztmp;
+
+      if (NEIGHFLAG != FULL) {
+        // skip half of the interactions
+        const tagint jtag = tag(j);
+        if (j >= nlocal) {
+          if (itag > jtag) {
+            if ((itag+jtag) % 2 == 0) continue;
+          } else if (itag < jtag) {
+            if ((itag+jtag) % 2 == 1) continue;
+          } else {
+            if (x(j,2) < ztmp) continue;
+            if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
+            if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
+          }
+        }
+      }
+
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq > cutsq) continue;
+
+      if (final) {
+        const F_FLOAT r = sqrt(rsq);
+        d_jlist(m_fill) = j;
+        const F_FLOAT shldij = d_shield(itype,jtype);
+        d_val(m_fill) = calculate_H_k(r,shldij);
+      }
+      m_fill++;
+    }
+    if (final)
+      d_numnbrs[i] = m_fill - d_firstnbr[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 // Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j
 // d_val     - contains the non-zero entries of sparse matrix H
 // d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.h b/src/KOKKOS/fix_qeq_reax_kokkos.h
index d4132b47d2..cd69aa9283 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.h
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.h
@@ -53,6 +53,10 @@ class FixQEqReaxKokkos : public FixQEqReax {
   KOKKOS_INLINE_FUNCTION
   void zero_item(int) const;
 
+  template<int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void compute_h_item(int, int &, const bool &) const;
+
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;
@@ -151,7 +155,6 @@ class FixQEqReaxKokkos : public FixQEqReax {
   int allocated_flag;
   int need_dup;
 
-  DAT::tdual_int_scalar k_mfill_offset;
   typename AT::t_int_scalar d_mfill_offset;
 
   typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
@@ -254,9 +257,14 @@ struct FixQEqReaxKokkosMatVecFunctor  {
 template <class DeviceType, int NEIGHFLAG>
 struct FixQEqReaxKokkosComputeHFunctor {
   int atoms_per_team, vector_length;
+  typedef int value_type;
   typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
   FixQEqReaxKokkos<DeviceType> c;
 
+  FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {
+    c.cleanup_copy();
+  };
+
   FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
                                   int _atoms_per_team, int _vector_length)
       : c(*c_ptr), atoms_per_team(_atoms_per_team),
@@ -264,6 +272,11 @@ struct FixQEqReaxKokkosComputeHFunctor {
     c.cleanup_copy();
   };
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int ii, int &m_fill, const bool &final) const {
+    c.template compute_h_item<NEIGHFLAG>(ii,m_fill,final);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator()(
       const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {