Merge pull request #1496 from akkamesh/enh-ext-reaxc

reaxc/qeq optimization - using kokkos hierarchical parallelism
2019-06-10 21:37:12 -04:00 · 2019-06-10 21:37:12 -04:00 · fe29572737
parent 8d985e53f4 9421466f57
commit fe29572737
12 changed files with 332 additions and 42 deletions
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@ -12,7 +12,8 @@
 ------------------------------------------------------------------------- */

 /* ----------------------------------------------------------------------
-   Contributing author: Ray Shan (SNL), Stan Moore (SNL)
+   Contributing authors: Ray Shan (SNL), Stan Moore (SNL),
+                          Kamesh Arumugam (NVIDIA)
 ------------------------------------------------------------------------- */

 #include <cmath>
@ -68,6 +69,8 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
  memory->destroy(s_hist);
  memory->destroy(t_hist);
  grow_arrays(atom->nmax);
+
+  d_mfill_offset = typename AT::t_int_scalar("qeq/kk:mfill_offset");
 }

 /* ---------------------------------------------------------------------- */
@ -217,17 +220,46 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
  copymode = 1;

  // allocate
+
  allocate_array();

  // get max number of neighbor
+
  if (!allocated_flag || update->ntimestep == neighbor->lastcall)
    allocate_matrix();

  // compute_H
-  FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);
-  Kokkos::parallel_scan(inum,computeH_functor);
+
+  if (lmp->kokkos->ngpus == 0) { // CPU
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
+      Kokkos::parallel_scan(inum,computeH_functor);
+    }
+  } else { // GPU, use teams
+    Kokkos::deep_copy(d_mfill_offset,0);
+
+    int vector_length = 32;
+    int atoms_per_team = 4;
+    int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
+
+    Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
+                                          vector_length);
+    if (neighflag == FULL) {
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    } else { // HALF and HALFTHREAD are the same
+      FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
+          this, atoms_per_team, vector_length);
+      Kokkos::parallel_for(policy, computeH_functor);
+    }
+  }

  // init_matvec
+
  k_s_hist.template sync<DeviceType>();
  k_t_hist.template sync<DeviceType>();
  FixQEqReaxKokkosMatVecFunctor<DeviceType> matvec_functor(this);
@ -257,12 +289,15 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
    ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);

  // 1st cg solve over b_s, s
+
  cg_solve1();

  // 2nd cg solve over b_t, t
+
  cg_solve2();

  // calculate_Q();
+
  calculate_q();
  k_s_hist.template modify<DeviceType>();
  k_t_hist.template modify<DeviceType>();
@ -273,6 +308,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
    allocated_flag = 1;

  // free duplicated memory
+
  if (need_dup)
    dup_o = decltype(dup_o)();

@ -377,6 +413,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
 /* ---------------------------------------------------------------------- */

 template<class DeviceType>
+template <int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
 {
@ -403,7 +440,7 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
      const X_FLOAT dely = x(j,1) - ytmp;
      const X_FLOAT delz = x(j,2) - ztmp;

-      if (neighflag != FULL) {
+      if (NEIGHFLAG != FULL) {
        // skip half of the interactions
        const tagint jtag = tag(j);
        if (j >= nlocal) {
@ -437,6 +474,217 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo

 /* ---------------------------------------------------------------------- */

+// Team-parallel construction of the sparse QEq matrix H, where H[i][j] holds
+// the electrostatic interaction coefficient of atom-i with atom-j.
+//
+// Outputs:
+//   d_val      - the non-zero entries of H
+//   d_jlist    - the column index (atom-j) of each entry in d_val
+//   d_firstnbr - d_firstnbr[i] is the index in d_val/d_jlist at which the
+//                entries of row-i begin
+//   d_numnbrs  - d_numnbrs[i] is the # of non-zero entries in row-i of H
+//                (the # of neighbors of atom-i that passed the cutoff test
+//                and, when NEIGHFLAG != FULL, the half-skip test)
+//
+// Arguments:
+//   team           - team handle of the enclosing TeamPolicy launch
+//   atoms_per_team - # of consecutive d_ilist entries handled by one team;
+//                    also the leading extent of every scratch view
+//   vector_length  - # of neighbors of one atom processed per batch; also
+//                    the second extent of the 2-d scratch views
+//
+// Each team reserves a contiguous slice of d_val/d_jlist, sized by the total
+// neighbor count of its atoms (an upper bound on its non-zeros), with an
+// atomic add on d_mfill_offset. The position of a team's slice therefore
+// depends on the order in which the atomic adds execute.
+
+template <class DeviceType>
+template <int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void FixQEqReaxKokkos<DeviceType>::compute_h_team(
+    const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
+    int atoms_per_team, int vector_length) const {
+
+  // scratch space setup: per-team copies of the atom indices, their neighbor
+  // counts and their row offsets relative to the start of the team's slice
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_ilist(team.team_shmem(), atoms_per_team);
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_numnbrs(team.team_shmem(), atoms_per_team);
+  Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_firstnbr(team.team_shmem(), atoms_per_team);
+
+  // staging buffers for one batch of neighbors: the row is selected with
+  // team.team_rank(), the column with the vector lane index
+  Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_jtype(team.team_shmem(), atoms_per_team, vector_length);
+  Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_jlist(team.team_shmem(), atoms_per_team, vector_length);
+  Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
+               Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+      s_r(team.team_shmem(), atoms_per_team, vector_length);
+
+  // team of threads work on atoms with index in [firstatom, lastatom)
+  int firstatom = team.league_rank() * atoms_per_team;
+  int lastatom =
+      (firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;
+
+  // kokkos-thread-0 is used to load info from global memory into scratch space
+  if (team.team_rank() == 0) {
+
+    // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
+    // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numnbrs[0:atoms_per_team]
+    // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
+    // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])
+    // the exclusive prefix sum of the neighbor counts is stored in s_firstnbr
+
+    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
+                          [&](const int &idx, int &totalnbrs, bool final) {
+                            int ii = firstatom + idx;
+
+                            if (ii < inum) {
+                              const int i = d_ilist[ii];
+                              int jnum = d_numneigh[i];
+
+                              if (final) {
+                                s_ilist[idx] = i;
+                                s_numnbrs[idx] = jnum;
+                                s_firstnbr[idx] = totalnbrs;
+                              }
+                              totalnbrs += jnum;
+                            } else {
+                              // slot past the end of the atom list: no neighbors
+                              s_numnbrs[idx] = 0;
+                            }
+                          });
+  }
+
+  // barrier ensures that the data moved to scratch space is visible to all the
+  // threads of the corresponding team
+  team.team_barrier();
+
+  // calculate the global memory offset from where the H matrix values to be
+  // calculated by the current team will be stored in d_val
+  // (offset + count of the team's last atom == total neighbors of the team)
+  int team_firstnbr_idx = 0;
+  Kokkos::single(Kokkos::PerTeam(team),
+                 [=](int &val) {
+                   int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
+                                   s_numnbrs[lastatom - firstatom - 1];
+                   val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
+                 },
+                 team_firstnbr_idx);
+
+  // map the H matrix computation of each atom to kokkos-thread (one atom per
+  // kokkos-thread) neighbor computation for each atom is assigned to vector
+  // lanes of the corresponding thread
+  Kokkos::parallel_for(
+      Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
+        int ii = firstatom + idx;
+
+        if (ii < inum) {
+          const int i = s_ilist[idx];
+
+          // atoms outside the fix group get no row: neither d_firstnbr[i]
+          // nor d_numnbrs[i] is written for them
+          if (mask[i] & groupbit) {
+            const X_FLOAT xtmp = x(i, 0);
+            const X_FLOAT ytmp = x(i, 1);
+            const X_FLOAT ztmp = x(i, 2);
+            const int itype = type(i);
+            const tagint itag = tag(i);
+            const int jnum = s_numnbrs[idx];
+
+            // calculate the write-offset for atom-i's first neighbor
+            int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
+            Kokkos::single(Kokkos::PerThread(team),
+                           [&]() { d_firstnbr[i] = atomi_firstnbr_idx; });
+
+            // current # of neighbor atoms with non-zero electrostatic
+            // interaction coefficients with atom-i which represents the # of
+            // non-zero elements in row-i of H matrix
+            int atomi_nbrs_inH = 0;
+
+            // calculate H matrix values corresponding to atom-i where neighbors
+            // are processed in batches and the batch size is vector_length
+            for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {
+
+              // entries of this batch are appended right after the entries
+              // written by the previous batches of atom-i
+              int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
+
+              // count the # of neighbor atoms with non-zero electrostatic
+              // interaction coefficients with atom-i in the current batch
+              int atomi_nbrs_curbatch = 0;
+
+              // compute rsq, jtype, j and store in scratch space which is
+              // reused later
+              Kokkos::parallel_reduce(
+                  Kokkos::ThreadVectorRange(team, vector_length),
+                  [&](const int &lane, int &m_fill) {
+                    const int jj = jj_start + lane;
+
+                    // initialize: -1 represents no interaction with atom-j
+                    // where j = d_neighbors(i,jj)
+                    s_jlist(team.team_rank(), lane) = -1;
+
+                    if (jj < jnum) {
+                      int j = d_neighbors(i, jj);
+                      j &= NEIGHMASK;
+                      const int jtype = type(j);
+
+                      const X_FLOAT delx = x(j, 0) - xtmp;
+                      const X_FLOAT dely = x(j, 1) - ytmp;
+                      const X_FLOAT delz = x(j, 2) - ztmp;
+
+                      // valid nbr interaction
+                      bool valid = true;
+                      if (NEIGHFLAG != FULL) {
+                        // skip half of the interactions
+                        // (only applied to neighbors with j >= nlocal)
+                        const tagint jtag = tag(j);
+                        if (j >= nlocal) {
+                          if (itag > jtag) {
+                            if ((itag + jtag) % 2 == 0)
+                              valid = false;
+                          } else if (itag < jtag) {
+                            if ((itag + jtag) % 2 == 1)
+                              valid = false;
+                          } else {
+                            // equal tags: break the tie on z, then y, then x
+                            if (x(j, 2) < ztmp)
+                              valid = false;
+                            if (x(j, 2) == ztmp && x(j, 1) < ytmp)
+                              valid = false;
+                            if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
+                                x(j, 0) < xtmp)
+                              valid = false;
+                          }
+                        }
+                      }
+
+                      const F_FLOAT rsq =
+                          delx * delx + dely * dely + delz * delz;
+                      if (rsq > cutsq)
+                        valid = false;
+
+                      if (valid) {
+                        s_jlist(team.team_rank(), lane) = j;
+                        s_jtype(team.team_rank(), lane) = jtype;
+                        s_r(team.team_rank(), lane) = sqrt(rsq);
+                        m_fill++;
+                      }
+                    }
+                  },
+                  atomi_nbrs_curbatch);
+
+              // write non-zero entries of H to global memory
+              // the scan value m_fill is the # of valid neighbors in lower
+              // lanes, i.e. the compacted position of this lane's entry
+              Kokkos::parallel_scan(
+                  Kokkos::ThreadVectorRange(team, vector_length),
+                  [&](const int &lane, int &m_fill, bool final) {
+                    int j = s_jlist(team.team_rank(), lane);
+                    if (final) {
+                      if (j != -1) {
+                        const int jtype = s_jtype(team.team_rank(), lane);
+                        const F_FLOAT r = s_r(team.team_rank(), lane);
+                        const F_FLOAT shldij = d_shield(itype, jtype);
+
+                        d_jlist[atomi_nbr_writeIdx + m_fill] = j;
+                        d_val[atomi_nbr_writeIdx + m_fill] =
+                            calculate_H_k(r, shldij);
+                      }
+                    }
+
+                    if (j != -1) {
+                      m_fill++;
+                    }
+                  });
+              atomi_nbrs_inH += atomi_nbrs_curbatch;
+            }
+
+            Kokkos::single(Kokkos::PerThread(team),
+                           [&]() { d_numnbrs[i] = atomi_nbrs_inH; });
+          }
+        }
+      });
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 double FixQEqReaxKokkos<DeviceType>::calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const
--- a/src/KOKKOS/fix_qeq_reax_kokkos.h
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.h
@ -53,9 +53,14 @@ class FixQEqReaxKokkos : public FixQEqReax {
  KOKKOS_INLINE_FUNCTION
  void zero_item(int) const;

+  template<int NEIGHFLAG>
  KOKKOS_INLINE_FUNCTION
  void compute_h_item(int, int &, const bool &) const;

+  template<int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;
+
  KOKKOS_INLINE_FUNCTION
  void matvec_item(int) const;

@ -150,6 +155,8 @@ class FixQEqReaxKokkos : public FixQEqReax {
  int allocated_flag;
  int need_dup;

+  typename AT::t_int_scalar d_mfill_offset;
+
  typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
  Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
  typename Kokkos::DualView<params_qeq*, Kokkos::LayoutRight,DeviceType>::t_dev_const params;
@ -247,16 +254,51 @@ struct FixQEqReaxKokkosMatVecFunctor  {
  }
 };

-template <class DeviceType>
-struct FixQEqReaxKokkosComputeHFunctor  {
-  typedef DeviceType  device_type ;
+// Functor that builds the QEq H matrix by forwarding to a private copy of
+// the fix. It exposes two call operators:
+//   (ii, m_fill, final) - scan form, forwards to compute_h_item<NEIGHFLAG>
+//   (team)              - team form, forwards to compute_h_team<NEIGHFLAG>
+// atoms_per_team and vector_length are only used by the team form and by
+// team_shmem_size(); the one-argument constructor leaves them at 0.
+// NOTE(review): the earlier version of this struct declared
+// `typedef DeviceType device_type;`. Without it Kokkos presumably falls back
+// to its default execution space for the range-based parallel_scan launch;
+// confirm that this is intended.
+template <class DeviceType, int NEIGHFLAG>
+struct FixQEqReaxKokkosComputeHFunctor {
+  // default-initialized so that copies of a scan-form functor never read
+  // indeterminate values
+  int atoms_per_team = 0, vector_length = 0;
+  typedef int value_type;
+  typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
  FixQEqReaxKokkos<DeviceType> c;
+
  FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {
    c.cleanup_copy();
  };
+
+  // team form: remembers the launch geometry needed to size and index the
+  // scratch views; initializers are listed in member declaration order
+  FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
+                                  int _atoms_per_team, int _vector_length)
+      : atoms_per_team(_atoms_per_team), vector_length(_vector_length),
+        c(*c_ptr) {
+    c.cleanup_copy();
+  }
+
  KOKKOS_INLINE_FUNCTION
  void operator()(const int ii, int &m_fill, const bool &final) const {
-    c.compute_h_item(ii,m_fill,final);
+    c.template compute_h_item<NEIGHFLAG>(ii,m_fill,final);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(
+      const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {
+    c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
+  }
+
+  // scratch bytes requested per team: the sum over the six scratch views
+  // created in compute_h_team; the team size argument is not used because
+  // every view is sized by atoms_per_team and vector_length
+  size_t team_shmem_size(int /*team_size*/) const {
+    size_t shmem_size =
+        Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
+            atoms_per_team) + // s_ilist
+        Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
+            atoms_per_team) + // s_numnbrs
+        Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
+            atoms_per_team) + // s_firstnbr
+        Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
+            shmem_size(atoms_per_team, vector_length) + // s_jtype
+        Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
+            shmem_size(atoms_per_team, vector_length) + // s_jlist
+        Kokkos::View<F_FLOAT **, scratch_space,
+                     Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team,
+                                                          vector_length); // s_r
+    return shmem_size;
  }
 };

--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -78,9 +78,9 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)

  // process any command-line args that invoke Kokkos settings

-  ngpu = 0;
+  ngpus = 0;
  int device = 0;
-  num_threads = 1;
+  nthreads = 1;
  numa = 1;

  int iarg = 0;
@ -96,7 +96,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
      error->all(FLERR,"GPUs are requested but Kokkos has not been compiled for CUDA");
 #endif
      if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args");
-      ngpu = atoi(arg[iarg+1]);
+      ngpus = atoi(arg[iarg+1]);

      int skip_gpu = 9999;
      if (iarg+2 < narg && isdigit(arg[iarg+2][0])) {
@ -108,23 +108,23 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
      char *str;
      if ((str = getenv("SLURM_LOCALID"))) {
        int local_rank = atoi(str);
-        device = local_rank % ngpu;
+        device = local_rank % ngpus;
        if (device >= skip_gpu) device++;
      }
      if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
        int local_rank = atoi(str);
-        device = local_rank % ngpu;
+        device = local_rank % ngpus;
        if (device >= skip_gpu) device++;
      }
      if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
        int local_rank = atoi(str);
-        device = local_rank % ngpu;
+        device = local_rank % ngpus;
        if (device >= skip_gpu) device++;
      }

    } else if (strcmp(arg[iarg],"t") == 0 ||
               strcmp(arg[iarg],"threads") == 0) {
-      num_threads = atoi(arg[iarg+1]);
+      nthreads = atoi(arg[iarg+1]);
      iarg += 2;

    } else if (strcmp(arg[iarg],"n") == 0 ||
@ -138,12 +138,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  // initialize Kokkos

  if (me == 0) {
-    if (screen) fprintf(screen,"  will use up to %d GPU(s) per node\n",ngpu);
-    if (logfile) fprintf(logfile,"  will use up to %d GPU(s) per node\n",ngpu);
+    if (screen) fprintf(screen,"  will use up to %d GPU(s) per node\n",ngpus);
+    if (logfile) fprintf(logfile,"  will use up to %d GPU(s) per node\n",ngpus);
  }

 #ifdef KOKKOS_ENABLE_CUDA
-  if (ngpu <= 0)
+  if (ngpus <= 0)
    error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");

  // check and warn about GPU-direct availability when using multiple MPI tasks
@ -167,14 +167,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 #endif

 #ifndef KOKKOS_ENABLE_SERIAL
-  if (num_threads == 1)
+  if (nthreads == 1)
    error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
                         "(i.e. Makefile.kokkos_mpi_only) gives better performance "
                         "than the OpenMP backend");
 #endif

  Kokkos::InitArguments args;
-  args.num_threads = num_threads;
+  args.num_threads = nthreads;
  args.num_numa = numa;
  args.device_id = device;

@ -187,14 +187,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  neigh_thread = 0;
  neigh_thread_set = 0;
  neighflag_qeq_set = 0;
-  if (ngpu > 0) {
+  if (ngpus > 0) {
    neighflag = FULL;
    neighflag_qeq = FULL;
    newtonflag = 0;
    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  } else {
-    if (num_threads > 1) {
+    if (nthreads > 1) {
      neighflag = HALFTHREAD;
      neighflag_qeq = HALFTHREAD;
    } else {
@ -237,7 +237,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL;
      else if (strcmp(arg[iarg+1],"half") == 0) {
-        if (num_threads > 1 || ngpu > 0)
+        if (nthreads > 1 || ngpus > 0)
          neighflag = HALFTHREAD;
        else
          neighflag = HALF;
@ -249,7 +249,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"full") == 0) neighflag_qeq = FULL;
      else if (strcmp(arg[iarg+1],"half") == 0) {
-        if (num_threads > 1 || ngpu > 0)
+        if (nthreads > 1 || ngpus > 0)
          neighflag_qeq = HALFTHREAD;
        else
          neighflag_qeq = HALF;
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -32,7 +32,7 @@ class KokkosLMP : protected Pointers {
  int exchange_comm_on_host;
  int forward_comm_on_host;
  int reverse_comm_on_host;
-  int num_threads,ngpu;
+  int nthreads,ngpus;
  int numa;
  int auto_sync;
  int gpu_direct_flag;
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@ -362,7 +362,7 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){

 /* ---------------------------------------------------------------------- */
 void NeighborKokkos::set_binsize_kokkos() {
-  if (!binsizeflag && lmp->kokkos->ngpu > 0) {
+  if (!binsizeflag && lmp->kokkos->ngpus > 0) {
    binsize_user = cutneighmax;
    binsizeflag = 1;
  }
--- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@ -310,12 +310,12 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)

 #else // No atomics

-  num_threads = lmp->kokkos->num_threads;
+  nthreads = lmp->kokkos->nthreads;
  int nmax = f.extent(0);
  if (nmax > t_f.extent(1)) {
-    t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
-    t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);
-    t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",num_threads,nmax);
+    t_f = t_f_array_thread("pair_exp6_rx:t_f",nthreads,nmax);
+    t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",nthreads,nmax);
+    t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",nthreads,nmax);
  }

  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroDupViews>(0,nmax),*this);
@ -1642,7 +1642,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, const int &i) const {
-  for (int n = 0; n < num_threads; n++) {
+  for (int n = 0; n < nthreads; n++) {
    f(i,0) += t_f(n,i,0);
    f(i,1) += t_f(n,i,1);
    f(i,2) += t_f(n,i,2);
@ -1654,7 +1654,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, con
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroDupViews, const int &i) const {
-  for (int n = 0; n < num_threads; n++) {
+  for (int n = 0; n < nthreads; n++) {
    t_f(n,i,0) = 0.0;
    t_f(n,i,1) = 0.0;
    t_f(n,i,2) = 0.0;
--- a/src/KOKKOS/pair_exp6_rx_kokkos.h
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@ -145,7 +145,7 @@ class PairExp6rxKokkos : public PairExp6rx {
  int eflag,vflag;
  int nlocal,newton_pair,neighflag;
  double special_lj[4];
-  int num_threads,ntypes;
+  int nthreads,ntypes;

  typename AT::t_x_array_randomread x;
  typename AT::t_f_array f;
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@ -1656,7 +1656,7 @@ void PPPMKokkos<DeviceType>::make_rho()
  iy = nyhi_out-nylo_out + 1;

  copymode = 1;
-  Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->num_threads,1);
+  Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->nthreads,1);
  Kokkos::parallel_for(config,*this);
  copymode = 0;
 #endif
--- a/src/KOKKOS/rand_pool_wrap_kokkos.cpp
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
@ -25,7 +25,7 @@ using namespace LAMMPS_NS;
 RandPoolWrap::RandPoolWrap(int, LAMMPS *lmp) : Pointers(lmp)
 {
  random_thr =  NULL;
-  nthreads = lmp->kokkos->num_threads;
+  nthreads = lmp->kokkos->nthreads;
 }

 /* ---------------------------------------------------------------------- */
@ -59,7 +59,7 @@ void RandPoolWrap::init(RanMars* random, int seed)
  // allocate pool of RNGs
  // generate a random number generator instance for
  // all threads != 0. make sure we use unique seeds.
-  nthreads = lmp->kokkos->num_threads;
+  nthreads = lmp->kokkos->nthreads;
  random_thr = new RanMars*[nthreads];
  for (int tid = 1; tid < nthreads; ++tid) {
    random_thr[tid] = new RanMars(lmp, seed + comm->me
--- a/src/accelerator_kokkos.h
+++ b/src/accelerator_kokkos.h
@ -50,8 +50,8 @@ namespace LAMMPS_NS {
 class KokkosLMP {
 public:
  int kokkos_exists;
-  int num_threads;
-  int ngpu;
+  int nthreads;
+  int ngpus;
  int numa;

  KokkosLMP(class LAMMPS *, int, char **) {kokkos_exists = 0;}
--- a/src/comm.cpp
+++ b/src/comm.cpp
@ -81,7 +81,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  nthreads = 1;
 #ifdef _OPENMP
  if (lmp->kokkos) {
-    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
+    nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
  } else if (getenv("OMP_NUM_THREADS") == NULL) {
    nthreads = 1;
    if (me == 0)
--- a/src/finish.cpp
+++ b/src/finish.cpp
@ -176,9 +176,9 @@ void Finish::end(int flag)
          const char fmt2[] =
            "%.1f%% CPU use with %d MPI tasks x %d OpenMP threads\n";
          if (screen) fprintf(screen,fmt2,cpu_loop,nprocs,
-                              lmp->kokkos->num_threads);
+                              lmp->kokkos->nthreads);
          if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs,
-                               lmp->kokkos->num_threads);
+                               lmp->kokkos->nthreads);
        } else {
 #if defined(_OPENMP)
          const char fmt2[] =
@ -579,7 +579,7 @@ void Finish::end(int flag)
  }
 #endif

-  if (lmp->kokkos && lmp->kokkos->ngpu > 0)
+  if (lmp->kokkos && lmp->kokkos->ngpus > 0)
    if (const char* env_clb = getenv("CUDA_LAUNCH_BLOCKING"))
      if (!(strcmp(env_clb,"1") == 0)) {
        error->warning(FLERR,"Timing breakdown may not be accurate "