Merge pull request #1496 from akkamesh/enh-ext-reaxc

reaxc/qeq optimization - using kokkos hierarchical parallelism
Commit fe29572737, authored by Axel Kohlmeyer on 2019-06-10 21:37:12 -04:00, committed by GitHub
12 changed files with 332 additions and 42 deletions
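The optimization replaces the flat per-atom parallel_scan used to build the QEq matrix on GPUs with Kokkos hierarchical parallelism: a league of teams, one thread per atom within a team, and vector lanes over each atom's neighbors. Below is a minimal, stand-alone sketch of that nesting (editorial illustration only; the function and names are not from the patch):

#include <Kokkos_Core.hpp>

// Sketch of the team/thread/vector nesting adopted by this PR (illustrative only).
void hierarchical_sketch(int inum, int atoms_per_team, int vector_length) {
  using policy_t = Kokkos::TeamPolicy<>;
  using member_t = policy_t::member_type;
  const int num_teams = (inum + atoms_per_team - 1) / atoms_per_team; // ceiling division
  Kokkos::parallel_for(
      policy_t(num_teams, atoms_per_team, vector_length),
      KOKKOS_LAMBDA(const member_t &team) {
        const int firstatom = team.league_rank() * atoms_per_team;
        // one Kokkos thread per atom of the team
        Kokkos::parallel_for(
            Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int t) {
              const int i = firstatom + t;
              if (i >= inum) return;
              // vector lanes sweep a batch of atom i's neighbors
              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(team, vector_length),
                  [&](const int lane) { (void)lane; /* per-neighbor work */ });
            });
      });
}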


@@ -12,7 +12,8 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Ray Shan (SNL), Stan Moore (SNL)
Contributing authors: Ray Shan (SNL), Stan Moore (SNL),
Kamesh Arumugam (NVIDIA)
------------------------------------------------------------------------- */
#include <cmath>
@@ -68,6 +69,8 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
memory->destroy(s_hist);
memory->destroy(t_hist);
grow_arrays(atom->nmax);
d_mfill_offset = typename AT::t_int_scalar("qeq/kk:mfill_offset");
}
/* ---------------------------------------------------------------------- */
@@ -217,17 +220,46 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
copymode = 1;
// allocate
allocate_array();
// get max number of neighbors
if (!allocated_flag || update->ntimestep == neighbor->lastcall)
allocate_matrix();
// compute_H
FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);
Kokkos::parallel_scan(inum,computeH_functor);
if (lmp->kokkos->ngpus == 0) { // CPU
if (neighflag == FULL) {
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
Kokkos::parallel_scan(inum,computeH_functor);
} else { // HALF and HALFTHREAD are the same
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
Kokkos::parallel_scan(inum,computeH_functor);
}
} else { // GPU, use teams
Kokkos::deep_copy(d_mfill_offset,0);
int vector_length = 32;
int atoms_per_team = 4;
int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
vector_length);
if (neighflag == FULL) {
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
this, atoms_per_team, vector_length);
Kokkos::parallel_for(policy, computeH_functor);
} else { // HALF and HALFTHREAD are the same
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
this, atoms_per_team, vector_length);
Kokkos::parallel_for(policy, computeH_functor);
}
}
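// Editorial note, not part of the patch: TeamPolicy's three arguments are
// league size, team size, and vector length, so each team above covers
// atoms_per_team = 4 atoms with vector_length = 32 lanes per atom, i.e. one
// thread block of 4 * 32 = 128 threads under the CUDA backend. num_teams is a
// ceiling division so a trailing partial group of atoms still gets a team:
//   num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
//   e.g. inum = 10, atoms_per_team = 4  ->  num_teams = 3 (atoms 0-3, 4-7, 8-9)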
// init_matvec
k_s_hist.template sync<DeviceType>();
k_t_hist.template sync<DeviceType>();
FixQEqReaxKokkosMatVecFunctor<DeviceType> matvec_functor(this);
@@ -257,12 +289,15 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);
// 1st cg solve over b_s, s
cg_solve1();
// 2nd cg solve over b_t, t
cg_solve2();
// calculate_Q();
calculate_q();
k_s_hist.template modify<DeviceType>();
k_t_hist.template modify<DeviceType>();
@@ -273,6 +308,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
allocated_flag = 1;
// free duplicated memory
if (need_dup)
dup_o = decltype(dup_o)();
@@ -377,6 +413,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template <int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
{
@@ -403,7 +440,7 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
const X_FLOAT dely = x(j,1) - ytmp;
const X_FLOAT delz = x(j,2) - ztmp;
if (neighflag != FULL) {
if (NEIGHFLAG != FULL) {
// skip half of the interactions
const tagint jtag = tag(j);
if (j >= nlocal) {
@@ -437,6 +474,217 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
/* ---------------------------------------------------------------------- */
// Calculate the QEq matrix H, a sparse matrix whose entry H[i][j] is the electrostatic interaction coefficient between atom-i and atom-j
// d_val - contains the non-zero entries of the sparse matrix H
// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (equivalently, the # of neighbor atoms with a non-zero electrostatic interaction with atom-i)
// d_firstnbr - d_firstnbr[i] contains the index in d_val at which the entries of row-i begin
// d_jlist - contains the column index corresponding to each entry in d_val
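// Editorial sketch, not part of the patch: together d_val, d_numnbrs, d_firstnbr,
// and d_jlist form a CSR-style sparse matrix. A hypothetical helper showing how a
// consumer (e.g. the CG solver's sparse matrix-vector product) walks row i:
double qeq_H_row_dot(int i, const int *d_firstnbr, const int *d_numnbrs,
                     const int *d_jlist, const double *d_val, const double *x)
{
  double sum = 0.0;
  const int start = d_firstnbr[i];          // entries of row i begin here in d_val/d_jlist
  for (int k = 0; k < d_numnbrs[i]; ++k) {  // d_numnbrs[i] = # of non-zeros in row i
    const int j = d_jlist[start + k];       // column index = neighbor atom j
    sum += d_val[start + k] * x[j];         // H[i][j] * x[j]
  }
  return sum;
}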
template <class DeviceType>
template <int NEIGHFLAG>
void FixQEqReaxKokkos<DeviceType>::compute_h_team(
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
int atoms_per_team, int vector_length) const {
// scratch space setup
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_ilist(team.team_shmem(), atoms_per_team);
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_numnbrs(team.team_shmem(), atoms_per_team);
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_firstnbr(team.team_shmem(), atoms_per_team);
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_jtype(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_jlist(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_r(team.team_shmem(), atoms_per_team, vector_length);
// team of threads work on atoms with index in [firstatom, lastatom)
int firstatom = team.league_rank() * atoms_per_team;
int lastatom =
(firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;
// kokkos-thread-0 is used to load info from global memory into scratch space
if (team.team_rank() == 0) {
// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numnbrs[0:atoms_per_team]
// calculate the total number of neighbor atoms over all atoms assigned to the current team (Note: this total is an upper bound on the space needed to store the
// H matrix values for the atoms with indices in d_ilist[firstatom:lastatom])
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
[&](const int &idx, int &totalnbrs, bool final) {
int ii = firstatom + idx;
if (ii < inum) {
const int i = d_ilist[ii];
int jnum = d_numneigh[i];
if (final) {
s_ilist[idx] = i;
s_numnbrs[idx] = jnum;
s_firstnbr[idx] = totalnbrs;
}
totalnbrs += jnum;
} else {
s_numnbrs[idx] = 0;
}
});
}
// barrier ensures that the data moved to scratch space is visible to all the
// threads of the corresponding team
team.team_barrier();
// calculate the global memory offset in d_val at which the H matrix values
// computed by the current team will be stored
int team_firstnbr_idx = 0;
Kokkos::single(Kokkos::PerTeam(team),
[=](int &val) {
int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
s_numnbrs[lastatom - firstatom - 1];
val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
},
team_firstnbr_idx);
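// Editorial note, not part of the patch: Kokkos::atomic_fetch_add returns the
// value of d_mfill_offset *before* the addition, so each team receives the start
// of a disjoint slice [team_firstnbr_idx, team_firstnbr_idx + totalnbrs) of
// d_jlist/d_val and can fill its rows without further synchronization.
// Worked example: if three teams need 100, 80, and 120 entries, the fetch-adds
// return 0, 100, and 180 (in whatever order the teams arrive) and the counter
// ends at 300.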
// map the H matrix computation of each atom to a Kokkos thread (one atom per
// thread); the neighbor computation for each atom is spread across the vector
// lanes of that thread
Kokkos::parallel_for(
Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
int ii = firstatom + idx;
if (ii < inum) {
const int i = s_ilist[idx];
if (mask[i] & groupbit) {
const X_FLOAT xtmp = x(i, 0);
const X_FLOAT ytmp = x(i, 1);
const X_FLOAT ztmp = x(i, 2);
const int itype = type(i);
const tagint itag = tag(i);
const int jnum = s_numnbrs[idx];
// calculate the write-offset for atom-i's first neighbor
int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
Kokkos::single(Kokkos::PerThread(team),
[&]() { d_firstnbr[i] = atomi_firstnbr_idx; });
// running count of neighbor atoms with a non-zero electrostatic
// interaction with atom-i, i.e. the # of non-zero
// elements in row-i of the H matrix
int atomi_nbrs_inH = 0;
// calculate the H matrix values for atom-i; neighbors are
// processed in batches of size vector_length
for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {
int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
// count the # of neighbor atoms with non-zero electrostatic
// interaction coefficients with atom-i in the current batch
int atomi_nbrs_curbatch = 0;
// compute rsq, jtype, j and store in scratch space which is
// reused later
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, vector_length),
[&](const int &idx, int &m_fill) {
const int jj = jj_start + idx;
// initialize: -1 represents no interaction with atom-j
// where j = d_neighbors(i,jj)
s_jlist(team.team_rank(), idx) = -1;
if (jj < jnum) {
int j = d_neighbors(i, jj);
j &= NEIGHMASK;
const int jtype = type(j);
const X_FLOAT delx = x(j, 0) - xtmp;
const X_FLOAT dely = x(j, 1) - ytmp;
const X_FLOAT delz = x(j, 2) - ztmp;
// valid nbr interaction
bool valid = true;
if (NEIGHFLAG != FULL) {
// skip half of the interactions
const tagint jtag = tag(j);
if (j >= nlocal) {
if (itag > jtag) {
if ((itag + jtag) % 2 == 0)
valid = false;
} else if (itag < jtag) {
if ((itag + jtag) % 2 == 1)
valid = false;
} else {
if (x(j, 2) < ztmp)
valid = false;
if (x(j, 2) == ztmp && x(j, 1) < ytmp)
valid = false;
if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
x(j, 0) < xtmp)
valid = false;
}
}
}
const F_FLOAT rsq =
delx * delx + dely * dely + delz * delz;
if (rsq > cutsq)
valid = false;
if (valid) {
s_jlist(team.team_rank(), idx) = j;
s_jtype(team.team_rank(), idx) = jtype;
s_r(team.team_rank(), idx) = sqrt(rsq);
m_fill++;
}
}
},
atomi_nbrs_curbatch);
// write non-zero entries of H to global memory
Kokkos::parallel_scan(
Kokkos::ThreadVectorRange(team, vector_length),
[&](const int &idx, int &m_fill, bool final) {
int j = s_jlist(team.team_rank(), idx);
if (final) {
if (j != -1) {
const int jtype = s_jtype(team.team_rank(), idx);
const F_FLOAT r = s_r(team.team_rank(), idx);
const F_FLOAT shldij = d_shield(itype, jtype);
d_jlist[atomi_nbr_writeIdx + m_fill] = j;
d_val[atomi_nbr_writeIdx + m_fill] =
calculate_H_k(r, shldij);
}
}
if (j != -1) {
m_fill++;
}
});
atomi_nbrs_inH += atomi_nbrs_curbatch;
}
Kokkos::single(Kokkos::PerThread(team),
[&]() { d_numnbrs[i] = atomi_nbrs_inH; });
}
}
});
}
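// Editorial sketch, not part of the patch: the per-batch parallel_scan above uses
// the Kokkos "final pass" idiom to compact valid neighbors into dense slots: the
// scan value m_fill is the exclusive prefix count of valid entries, and the writes
// happen only when final == true. A hypothetical stand-alone example of the same
// idiom (assumes <Kokkos_Core.hpp>); for flags {1,0,1,1} it writes indices 0, 2, 3
// to out(0), out(1), out(2):
void compact_flagged_indices(const Kokkos::View<const int *> &flag,
                             const Kokkos::View<int *> &out, int n)
{
  Kokkos::parallel_scan(
      n, KOKKOS_LAMBDA(const int i, int &offset, const bool final) {
        if (final && flag(i)) out(offset) = i;  // offset = exclusive prefix sum of flags
        if (flag(i)) offset += 1;
      });
}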
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
double FixQEqReaxKokkos<DeviceType>::calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const


@@ -53,9 +53,14 @@ class FixQEqReaxKokkos : public FixQEqReax {
KOKKOS_INLINE_FUNCTION
void zero_item(int) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void compute_h_item(int, int &, const bool &) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;
KOKKOS_INLINE_FUNCTION
void matvec_item(int) const;
@@ -150,6 +155,8 @@ class FixQEqReaxKokkos : public FixQEqReax {
int allocated_flag;
int need_dup;
typename AT::t_int_scalar d_mfill_offset;
typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
typename Kokkos::DualView<params_qeq*, Kokkos::LayoutRight,DeviceType>::t_dev_const params;
@@ -247,16 +254,51 @@ struct FixQEqReaxKokkosMatVecFunctor {
}
};
template <class DeviceType>
struct FixQEqReaxKokkosComputeHFunctor {
typedef DeviceType device_type ;
template <class DeviceType, int NEIGHFLAG>
struct FixQEqReaxKokkosComputeHFunctor {
int atoms_per_team, vector_length;
typedef int value_type;
typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
FixQEqReaxKokkos<DeviceType> c;
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {
c.cleanup_copy();
};
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
int _atoms_per_team, int _vector_length)
: c(*c_ptr), atoms_per_team(_atoms_per_team),
vector_length(_vector_length) {
c.cleanup_copy();
};
KOKKOS_INLINE_FUNCTION
void operator()(const int ii, int &m_fill, const bool &final) const {
c.compute_h_item(ii,m_fill,final);
c.template compute_h_item<NEIGHFLAG>(ii,m_fill,final);
}
KOKKOS_INLINE_FUNCTION
void operator()(
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {
c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
}
size_t team_shmem_size(int team_size) const {
size_t shmem_size =
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_ilist
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_numnbrs
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_firstnbr
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
shmem_size(atoms_per_team, vector_length) + // s_jtype
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
shmem_size(atoms_per_team, vector_length) + // s_j
Kokkos::View<F_FLOAT **, scratch_space,
Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team,
vector_length); // s_r
return shmem_size;
}
};
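// Editorial note, not part of the patch: Kokkos calls the functor's
// team_shmem_size(team_size) when the TeamPolicy launches, so the bytes summed
// above must cover every scratch View constructed from team.team_shmem() in
// compute_h_team(). A hypothetical alternative (sketch only, as it would replace
// defining team_shmem_size on the functor) is to request the scratch on the
// policy itself; level-0 scratch maps to CUDA shared memory on the GPU backend:
template <class DeviceType, class Functor>
void launch_with_policy_scratch(int num_teams, int atoms_per_team,
                                int vector_length, const Functor &f,
                                size_t bytes_per_team)
{
  auto policy = Kokkos::TeamPolicy<DeviceType>(num_teams, atoms_per_team, vector_length)
                    .set_scratch_size(0, Kokkos::PerTeam(bytes_per_team));
  Kokkos::parallel_for(policy, f);
}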


@@ -78,9 +78,9 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
// process any command-line args that invoke Kokkos settings
ngpu = 0;
ngpus = 0;
int device = 0;
num_threads = 1;
nthreads = 1;
numa = 1;
int iarg = 0;
@@ -96,7 +96,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
error->all(FLERR,"GPUs are requested but Kokkos has not been compiled for CUDA");
#endif
if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args");
ngpu = atoi(arg[iarg+1]);
ngpus = atoi(arg[iarg+1]);
int skip_gpu = 9999;
if (iarg+2 < narg && isdigit(arg[iarg+2][0])) {
@@ -108,23 +108,23 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
char *str;
if ((str = getenv("SLURM_LOCALID"))) {
int local_rank = atoi(str);
device = local_rank % ngpu;
device = local_rank % ngpus;
if (device >= skip_gpu) device++;
}
if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ngpu;
device = local_rank % ngpus;
if (device >= skip_gpu) device++;
}
if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ngpu;
device = local_rank % ngpus;
if (device >= skip_gpu) device++;
}
} else if (strcmp(arg[iarg],"t") == 0 ||
strcmp(arg[iarg],"threads") == 0) {
num_threads = atoi(arg[iarg+1]);
nthreads = atoi(arg[iarg+1]);
iarg += 2;
} else if (strcmp(arg[iarg],"n") == 0 ||
@@ -138,12 +138,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
// initialize Kokkos
if (me == 0) {
if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpu);
if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpu);
if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpus);
if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpus);
}
#ifdef KOKKOS_ENABLE_CUDA
if (ngpu <= 0)
if (ngpus <= 0)
error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
// check and warn about GPU-direct availability when using multiple MPI tasks
@@ -167,14 +167,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
#endif
#ifndef KOKKOS_ENABLE_SERIAL
if (num_threads == 1)
if (nthreads == 1)
error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
"(i.e. Makefile.kokkos_mpi_only) gives better performance "
"than the OpenMP backend");
#endif
Kokkos::InitArguments args;
args.num_threads = num_threads;
args.num_threads = nthreads;
args.num_numa = numa;
args.device_id = device;
@@ -187,14 +187,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
neigh_thread = 0;
neigh_thread_set = 0;
neighflag_qeq_set = 0;
if (ngpu > 0) {
if (ngpus > 0) {
neighflag = FULL;
neighflag_qeq = FULL;
newtonflag = 0;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
} else {
if (num_threads > 1) {
if (nthreads > 1) {
neighflag = HALFTHREAD;
neighflag_qeq = HALFTHREAD;
} else {
@@ -237,7 +237,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL;
else if (strcmp(arg[iarg+1],"half") == 0) {
if (num_threads > 1 || ngpu > 0)
if (nthreads > 1 || ngpus > 0)
neighflag = HALFTHREAD;
else
neighflag = HALF;
@@ -249,7 +249,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
if (strcmp(arg[iarg+1],"full") == 0) neighflag_qeq = FULL;
else if (strcmp(arg[iarg+1],"half") == 0) {
if (num_threads > 1 || ngpu > 0)
if (nthreads > 1 || ngpus > 0)
neighflag_qeq = HALFTHREAD;
else
neighflag_qeq = HALF;


@@ -32,7 +32,7 @@ class KokkosLMP : protected Pointers {
int exchange_comm_on_host;
int forward_comm_on_host;
int reverse_comm_on_host;
int num_threads,ngpu;
int nthreads,ngpus;
int numa;
int auto_sync;
int gpu_direct_flag;


@@ -362,7 +362,7 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
/* ---------------------------------------------------------------------- */
void NeighborKokkos::set_binsize_kokkos() {
if (!binsizeflag && lmp->kokkos->ngpu > 0) {
if (!binsizeflag && lmp->kokkos->ngpus > 0) {
binsize_user = cutneighmax;
binsizeflag = 1;
}


@@ -310,12 +310,12 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
#else // No atomics
num_threads = lmp->kokkos->num_threads;
nthreads = lmp->kokkos->nthreads;
int nmax = f.extent(0);
if (nmax > t_f.extent(1)) {
t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);
t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",num_threads,nmax);
t_f = t_f_array_thread("pair_exp6_rx:t_f",nthreads,nmax);
t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",nthreads,nmax);
t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",nthreads,nmax);
}
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroDupViews>(0,nmax),*this);
@@ -1642,7 +1642,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, const int &i) const {
for (int n = 0; n < num_threads; n++) {
for (int n = 0; n < nthreads; n++) {
f(i,0) += t_f(n,i,0);
f(i,1) += t_f(n,i,1);
f(i,2) += t_f(n,i,2);
@@ -1654,7 +1654,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, con
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroDupViews, const int &i) const {
for (int n = 0; n < num_threads; n++) {
for (int n = 0; n < nthreads; n++) {
t_f(n,i,0) = 0.0;
t_f(n,i,1) = 0.0;
t_f(n,i,2) = 0.0;


@@ -145,7 +145,7 @@ class PairExp6rxKokkos : public PairExp6rx {
int eflag,vflag;
int nlocal,newton_pair,neighflag;
double special_lj[4];
int num_threads,ntypes;
int nthreads,ntypes;
typename AT::t_x_array_randomread x;
typename AT::t_f_array f;


@@ -1656,7 +1656,7 @@ void PPPMKokkos<DeviceType>::make_rho()
iy = nyhi_out-nylo_out + 1;
copymode = 1;
Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->num_threads,1);
Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->nthreads,1);
Kokkos::parallel_for(config,*this);
copymode = 0;
#endif


@@ -25,7 +25,7 @@ using namespace LAMMPS_NS;
RandPoolWrap::RandPoolWrap(int, LAMMPS *lmp) : Pointers(lmp)
{
random_thr = NULL;
nthreads = lmp->kokkos->num_threads;
nthreads = lmp->kokkos->nthreads;
}
/* ---------------------------------------------------------------------- */
@@ -59,7 +59,7 @@ void RandPoolWrap::init(RanMars* random, int seed)
// allocate pool of RNGs
// generate a random number generator instance for
// all threads != 0. make sure we use unique seeds.
nthreads = lmp->kokkos->num_threads;
nthreads = lmp->kokkos->nthreads;
random_thr = new RanMars*[nthreads];
for (int tid = 1; tid < nthreads; ++tid) {
random_thr[tid] = new RanMars(lmp, seed + comm->me


@@ -50,8 +50,8 @@ namespace LAMMPS_NS {
class KokkosLMP {
public:
int kokkos_exists;
int num_threads;
int ngpu;
int nthreads;
int ngpus;
int numa;
KokkosLMP(class LAMMPS *, int, char **) {kokkos_exists = 0;}


@@ -81,7 +81,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
nthreads = 1;
#ifdef _OPENMP
if (lmp->kokkos) {
nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
} else if (getenv("OMP_NUM_THREADS") == NULL) {
nthreads = 1;
if (me == 0)


@@ -176,9 +176,9 @@ void Finish::end(int flag)
const char fmt2[] =
"%.1f%% CPU use with %d MPI tasks x %d OpenMP threads\n";
if (screen) fprintf(screen,fmt2,cpu_loop,nprocs,
lmp->kokkos->num_threads);
lmp->kokkos->nthreads);
if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs,
lmp->kokkos->num_threads);
lmp->kokkos->nthreads);
} else {
#if defined(_OPENMP)
const char fmt2[] =
@@ -579,7 +579,7 @@ void Finish::end(int flag)
}
#endif
if (lmp->kokkos && lmp->kokkos->ngpu > 0)
if (lmp->kokkos && lmp->kokkos->ngpus > 0)
if (const char* env_clb = getenv("CUDA_LAUNCH_BLOCKING"))
if (!(strcmp(env_clb,"1") == 0)) {
error->warning(FLERR,"Timing breakdown may not be accurate "