forked from lijiext/lammps
Merge pull request #1496 from akkamesh/enh-ext-reaxc
reaxc/qeq optimization - using kokkos hierarchical parallelism
This commit is contained in:
commit
fe29572737
|
@ -12,7 +12,8 @@
|
|||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Ray Shan (SNL), Stan Moore (SNL)
|
||||
Contributing authors: Ray Shan (SNL), Stan Moore (SNL),
|
||||
Kamesh Arumugam (NVIDIA)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <cmath>
|
||||
|
@ -68,6 +69,8 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
|
|||
memory->destroy(s_hist);
|
||||
memory->destroy(t_hist);
|
||||
grow_arrays(atom->nmax);
|
||||
|
||||
d_mfill_offset = typename AT::t_int_scalar("qeq/kk:mfill_offset");
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
@ -217,17 +220,46 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
|
|||
copymode = 1;
|
||||
|
||||
// allocate
|
||||
|
||||
allocate_array();
|
||||
|
||||
// get max number of neighbors
|
||||
|
||||
if (!allocated_flag || update->ntimestep == neighbor->lastcall)
|
||||
allocate_matrix();
|
||||
|
||||
// compute_H
|
||||
FixQEqReaxKokkosComputeHFunctor<DeviceType> computeH_functor(this);
|
||||
Kokkos::parallel_scan(inum,computeH_functor);
|
||||
|
||||
if (lmp->kokkos->ngpus == 0) { // CPU
|
||||
if (neighflag == FULL) {
|
||||
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this);
|
||||
Kokkos::parallel_scan(inum,computeH_functor);
|
||||
} else { // HALF and HALFTHREAD are the same
|
||||
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this);
|
||||
Kokkos::parallel_scan(inum,computeH_functor);
|
||||
}
|
||||
} else { // GPU, use teams
|
||||
Kokkos::deep_copy(d_mfill_offset,0);
|
||||
|
||||
int vector_length = 32;
|
||||
int atoms_per_team = 4;
|
||||
int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
|
||||
|
||||
Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
|
||||
vector_length);
|
||||
if (neighflag == FULL) {
|
||||
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
|
||||
this, atoms_per_team, vector_length);
|
||||
Kokkos::parallel_for(policy, computeH_functor);
|
||||
} else { // HALF and HALFTHREAD are the same
|
||||
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
|
||||
this, atoms_per_team, vector_length);
|
||||
Kokkos::parallel_for(policy, computeH_functor);
|
||||
}
|
||||
}
|
||||
|
||||
// init_matvec
|
||||
|
||||
k_s_hist.template sync<DeviceType>();
|
||||
k_t_hist.template sync<DeviceType>();
|
||||
FixQEqReaxKokkosMatVecFunctor<DeviceType> matvec_functor(this);
|
||||
|
@ -257,12 +289,15 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
|
|||
ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o);
|
||||
|
||||
// 1st cg solve over b_s, s
|
||||
|
||||
cg_solve1();
|
||||
|
||||
// 2nd cg solve over b_t, t
|
||||
|
||||
cg_solve2();
|
||||
|
||||
// calculate_Q();
|
||||
|
||||
calculate_q();
|
||||
k_s_hist.template modify<DeviceType>();
|
||||
k_t_hist.template modify<DeviceType>();
|
||||
|
@ -273,6 +308,7 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
|
|||
allocated_flag = 1;
|
||||
|
||||
// free duplicated memory
|
||||
|
||||
if (need_dup)
|
||||
dup_o = decltype(dup_o)();
|
||||
|
||||
|
@ -377,6 +413,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
|
|||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
template <int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const bool &final) const
|
||||
{
|
||||
|
@ -403,7 +440,7 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
|
|||
const X_FLOAT dely = x(j,1) - ytmp;
|
||||
const X_FLOAT delz = x(j,2) - ztmp;
|
||||
|
||||
if (neighflag != FULL) {
|
||||
if (NEIGHFLAG != FULL) {
|
||||
// skip half of the interactions
|
||||
const tagint jtag = tag(j);
|
||||
if (j >= nlocal) {
|
||||
|
@ -437,6 +474,217 @@ void FixQEqReaxKokkos<DeviceType>::compute_h_item(int ii, int &m_fill, const boo
|
|||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// Calculate the QEq matrix H, where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficient between atom-i and atom-j
|
||||
// d_val - contains the non-zero entries of sparse matrix H
|
||||
// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
|
||||
// d_firstnbr - d_firstnbr[i] contains the index in d_val at which the H matrix entries corresponding to row-i begin
|
||||
// d_jlist - contains the column index corresponding to each entry in d_val
|
||||
|
||||
template <class DeviceType>
|
||||
template <int NEIGHFLAG>
|
||||
void FixQEqReaxKokkos<DeviceType>::compute_h_team(
|
||||
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
|
||||
int atoms_per_team, int vector_length) const {
|
||||
|
||||
// scratch space setup
|
||||
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_ilist(team.team_shmem(), atoms_per_team);
|
||||
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_numnbrs(team.team_shmem(), atoms_per_team);
|
||||
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_firstnbr(team.team_shmem(), atoms_per_team);
|
||||
|
||||
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_jtype(team.team_shmem(), atoms_per_team, vector_length);
|
||||
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_jlist(team.team_shmem(), atoms_per_team, vector_length);
|
||||
Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
s_r(team.team_shmem(), atoms_per_team, vector_length);
|
||||
|
||||
// team of threads work on atoms with index in [firstatom, lastatom)
|
||||
int firstatom = team.league_rank() * atoms_per_team;
|
||||
int lastatom =
|
||||
(firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;
|
||||
|
||||
// kokkos-thread-0 is used to load info from global memory into scratch space
|
||||
if (team.team_rank() == 0) {
|
||||
|
||||
// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
|
||||
// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numnbrs[0:atoms_per_team]
|
||||
// calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
|
||||
// upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])
|
||||
|
||||
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
|
||||
[&](const int &idx, int &totalnbrs, bool final) {
|
||||
int ii = firstatom + idx;
|
||||
|
||||
if (ii < inum) {
|
||||
const int i = d_ilist[ii];
|
||||
int jnum = d_numneigh[i];
|
||||
|
||||
if (final) {
|
||||
s_ilist[idx] = i;
|
||||
s_numnbrs[idx] = jnum;
|
||||
s_firstnbr[idx] = totalnbrs;
|
||||
}
|
||||
totalnbrs += jnum;
|
||||
} else {
|
||||
s_numnbrs[idx] = 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// barrier ensures that the data moved to scratch space is visible to all the
|
||||
// threads of the corresponding team
|
||||
team.team_barrier();
|
||||
|
||||
// calculate the global memory offset from where the H matrix values to be
|
||||
// calculated by the current team will be stored in d_val
|
||||
int team_firstnbr_idx = 0;
|
||||
Kokkos::single(Kokkos::PerTeam(team),
|
||||
[=](int &val) {
|
||||
int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
|
||||
s_numnbrs[lastatom - firstatom - 1];
|
||||
val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
|
||||
},
|
||||
team_firstnbr_idx);
|
||||
|
||||
// map the H matrix computation of each atom to kokkos-thread (one atom per
|
||||
// kokkos-thread); the neighbor computation for each atom is assigned to the vector
|
||||
// lanes of the corresponding thread
|
||||
Kokkos::parallel_for(
|
||||
Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
|
||||
int ii = firstatom + idx;
|
||||
|
||||
if (ii < inum) {
|
||||
const int i = s_ilist[idx];
|
||||
|
||||
if (mask[i] & groupbit) {
|
||||
const X_FLOAT xtmp = x(i, 0);
|
||||
const X_FLOAT ytmp = x(i, 1);
|
||||
const X_FLOAT ztmp = x(i, 2);
|
||||
const int itype = type(i);
|
||||
const tagint itag = tag(i);
|
||||
const int jnum = s_numnbrs[idx];
|
||||
|
||||
// calculate the write-offset for atom-i's first neighbor
|
||||
int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
|
||||
Kokkos::single(Kokkos::PerThread(team),
|
||||
[&]() { d_firstnbr[i] = atomi_firstnbr_idx; });
|
||||
|
||||
// current # of neighbor atoms with non-zero electrostatic
|
||||
// interaction coefficients with atom-i which represents the # of
|
||||
// non-zero elements in row-i of H matrix
|
||||
int atomi_nbrs_inH = 0;
|
||||
|
||||
// calculate H matrix values corresponding to atom-i where neighbors
|
||||
// are processed in batches and the batch size is vector_length
|
||||
for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {
|
||||
|
||||
int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
|
||||
|
||||
// count the # of neighbor atoms with non-zero electrostatic
|
||||
// interaction coefficients with atom-i in the current batch
|
||||
int atomi_nbrs_curbatch = 0;
|
||||
|
||||
// compute j, jtype and r = sqrt(rsq) and store them in scratch space, which is
|
||||
// reused later
|
||||
Kokkos::parallel_reduce(
|
||||
Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&](const int &idx, int &m_fill) {
|
||||
const int jj = jj_start + idx;
|
||||
|
||||
// initialize: -1 represents no interaction with atom-j
|
||||
// where j = d_neighbors(i,jj)
|
||||
s_jlist(team.team_rank(), idx) = -1;
|
||||
|
||||
if (jj < jnum) {
|
||||
int j = d_neighbors(i, jj);
|
||||
j &= NEIGHMASK;
|
||||
const int jtype = type(j);
|
||||
|
||||
const X_FLOAT delx = x(j, 0) - xtmp;
|
||||
const X_FLOAT dely = x(j, 1) - ytmp;
|
||||
const X_FLOAT delz = x(j, 2) - ztmp;
|
||||
|
||||
// valid nbr interaction
|
||||
bool valid = true;
|
||||
if (NEIGHFLAG != FULL) {
|
||||
// skip half of the interactions
|
||||
const tagint jtag = tag(j);
|
||||
if (j >= nlocal) {
|
||||
if (itag > jtag) {
|
||||
if ((itag + jtag) % 2 == 0)
|
||||
valid = false;
|
||||
} else if (itag < jtag) {
|
||||
if ((itag + jtag) % 2 == 1)
|
||||
valid = false;
|
||||
} else {
|
||||
if (x(j, 2) < ztmp)
|
||||
valid = false;
|
||||
if (x(j, 2) == ztmp && x(j, 1) < ytmp)
|
||||
valid = false;
|
||||
if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
|
||||
x(j, 0) < xtmp)
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const F_FLOAT rsq =
|
||||
delx * delx + dely * dely + delz * delz;
|
||||
if (rsq > cutsq)
|
||||
valid = false;
|
||||
|
||||
if (valid) {
|
||||
s_jlist(team.team_rank(), idx) = j;
|
||||
s_jtype(team.team_rank(), idx) = jtype;
|
||||
s_r(team.team_rank(), idx) = sqrt(rsq);
|
||||
m_fill++;
|
||||
}
|
||||
}
|
||||
},
|
||||
atomi_nbrs_curbatch);
|
||||
|
||||
// write non-zero entries of H to global memory
|
||||
Kokkos::parallel_scan(
|
||||
Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&](const int &idx, int &m_fill, bool final) {
|
||||
int j = s_jlist(team.team_rank(), idx);
|
||||
if (final) {
|
||||
if (j != -1) {
|
||||
const int jtype = s_jtype(team.team_rank(), idx);
|
||||
const F_FLOAT r = s_r(team.team_rank(), idx);
|
||||
const F_FLOAT shldij = d_shield(itype, jtype);
|
||||
|
||||
d_jlist[atomi_nbr_writeIdx + m_fill] = j;
|
||||
d_val[atomi_nbr_writeIdx + m_fill] =
|
||||
calculate_H_k(r, shldij);
|
||||
}
|
||||
}
|
||||
|
||||
if (j != -1) {
|
||||
m_fill++;
|
||||
}
|
||||
});
|
||||
atomi_nbrs_inH += atomi_nbrs_curbatch;
|
||||
}
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team),
|
||||
[&]() { d_numnbrs[i] = atomi_nbrs_inH; });
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double FixQEqReaxKokkos<DeviceType>::calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const
|
||||
|
|
|
@ -53,9 +53,14 @@ class FixQEqReaxKokkos : public FixQEqReax {
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void zero_item(int) const;
|
||||
|
||||
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_h_item(int, int &, const bool &) const;
|
||||
|
||||
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team, int, int) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void matvec_item(int) const;
|
||||
|
||||
|
@ -150,6 +155,8 @@ class FixQEqReaxKokkos : public FixQEqReax {
|
|||
int allocated_flag;
|
||||
int need_dup;
|
||||
|
||||
typename AT::t_int_scalar d_mfill_offset;
|
||||
|
||||
typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d;
|
||||
Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params;
|
||||
typename Kokkos::DualView<params_qeq*, Kokkos::LayoutRight,DeviceType>::t_dev_const params;
|
||||
|
@ -247,16 +254,51 @@ struct FixQEqReaxKokkosMatVecFunctor {
|
|||
}
|
||||
};
|
||||
|
||||
template <class DeviceType>
|
||||
struct FixQEqReaxKokkosComputeHFunctor {
|
||||
typedef DeviceType device_type ;
|
||||
template <class DeviceType, int NEIGHFLAG>
|
||||
struct FixQEqReaxKokkosComputeHFunctor {
|
||||
int atoms_per_team, vector_length;
|
||||
typedef int value_type;
|
||||
typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
|
||||
FixQEqReaxKokkos<DeviceType> c;
|
||||
|
||||
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr):c(*c_ptr) {
|
||||
c.cleanup_copy();
|
||||
};
|
||||
|
||||
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
|
||||
int _atoms_per_team, int _vector_length)
|
||||
: c(*c_ptr), atoms_per_team(_atoms_per_team),
|
||||
vector_length(_vector_length) {
|
||||
c.cleanup_copy();
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int ii, int &m_fill, const bool &final) const {
|
||||
c.compute_h_item(ii,m_fill,final);
|
||||
c.template compute_h_item<NEIGHFLAG>(ii,m_fill,final);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(
|
||||
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {
|
||||
c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
|
||||
}
|
||||
|
||||
size_t team_shmem_size(int team_size) const {
|
||||
size_t shmem_size =
|
||||
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
|
||||
atoms_per_team) + // s_ilist
|
||||
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
|
||||
atoms_per_team) + // s_numnbrs
|
||||
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
|
||||
atoms_per_team) + // s_firstnbr
|
||||
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
|
||||
shmem_size(atoms_per_team, vector_length) + // s_jtype
|
||||
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
|
||||
shmem_size(atoms_per_team, vector_length) + // s_j
|
||||
Kokkos::View<F_FLOAT **, scratch_space,
|
||||
Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team,
|
||||
vector_length); // s_r
|
||||
return shmem_size;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -78,9 +78,9 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
|
||||
// process any command-line args that invoke Kokkos settings
|
||||
|
||||
ngpu = 0;
|
||||
ngpus = 0;
|
||||
int device = 0;
|
||||
num_threads = 1;
|
||||
nthreads = 1;
|
||||
numa = 1;
|
||||
|
||||
int iarg = 0;
|
||||
|
@ -96,7 +96,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
error->all(FLERR,"GPUs are requested but Kokkos has not been compiled for CUDA");
|
||||
#endif
|
||||
if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args");
|
||||
ngpu = atoi(arg[iarg+1]);
|
||||
ngpus = atoi(arg[iarg+1]);
|
||||
|
||||
int skip_gpu = 9999;
|
||||
if (iarg+2 < narg && isdigit(arg[iarg+2][0])) {
|
||||
|
@ -108,23 +108,23 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
char *str;
|
||||
if ((str = getenv("SLURM_LOCALID"))) {
|
||||
int local_rank = atoi(str);
|
||||
device = local_rank % ngpu;
|
||||
device = local_rank % ngpus;
|
||||
if (device >= skip_gpu) device++;
|
||||
}
|
||||
if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
|
||||
int local_rank = atoi(str);
|
||||
device = local_rank % ngpu;
|
||||
device = local_rank % ngpus;
|
||||
if (device >= skip_gpu) device++;
|
||||
}
|
||||
if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
|
||||
int local_rank = atoi(str);
|
||||
device = local_rank % ngpu;
|
||||
device = local_rank % ngpus;
|
||||
if (device >= skip_gpu) device++;
|
||||
}
|
||||
|
||||
} else if (strcmp(arg[iarg],"t") == 0 ||
|
||||
strcmp(arg[iarg],"threads") == 0) {
|
||||
num_threads = atoi(arg[iarg+1]);
|
||||
nthreads = atoi(arg[iarg+1]);
|
||||
iarg += 2;
|
||||
|
||||
} else if (strcmp(arg[iarg],"n") == 0 ||
|
||||
|
@ -138,12 +138,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
// initialize Kokkos
|
||||
|
||||
if (me == 0) {
|
||||
if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpu);
|
||||
if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpu);
|
||||
if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpus);
|
||||
if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpus);
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
if (ngpu <= 0)
|
||||
if (ngpus <= 0)
|
||||
error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
|
||||
|
||||
// check and warn about GPU-direct availability when using multiple MPI tasks
|
||||
|
@ -167,14 +167,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
#endif
|
||||
|
||||
#ifndef KOKKOS_ENABLE_SERIAL
|
||||
if (num_threads == 1)
|
||||
if (nthreads == 1)
|
||||
error->warning(FLERR,"When using a single thread, the Kokkos Serial backend "
|
||||
"(i.e. Makefile.kokkos_mpi_only) gives better performance "
|
||||
"than the OpenMP backend");
|
||||
#endif
|
||||
|
||||
Kokkos::InitArguments args;
|
||||
args.num_threads = num_threads;
|
||||
args.num_threads = nthreads;
|
||||
args.num_numa = numa;
|
||||
args.device_id = device;
|
||||
|
||||
|
@ -187,14 +187,14 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
|||
neigh_thread = 0;
|
||||
neigh_thread_set = 0;
|
||||
neighflag_qeq_set = 0;
|
||||
if (ngpu > 0) {
|
||||
if (ngpus > 0) {
|
||||
neighflag = FULL;
|
||||
neighflag_qeq = FULL;
|
||||
newtonflag = 0;
|
||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||
} else {
|
||||
if (num_threads > 1) {
|
||||
if (nthreads > 1) {
|
||||
neighflag = HALFTHREAD;
|
||||
neighflag_qeq = HALFTHREAD;
|
||||
} else {
|
||||
|
@ -237,7 +237,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
|||
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
|
||||
if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL;
|
||||
else if (strcmp(arg[iarg+1],"half") == 0) {
|
||||
if (num_threads > 1 || ngpu > 0)
|
||||
if (nthreads > 1 || ngpus > 0)
|
||||
neighflag = HALFTHREAD;
|
||||
else
|
||||
neighflag = HALF;
|
||||
|
@ -249,7 +249,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
|||
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
|
||||
if (strcmp(arg[iarg+1],"full") == 0) neighflag_qeq = FULL;
|
||||
else if (strcmp(arg[iarg+1],"half") == 0) {
|
||||
if (num_threads > 1 || ngpu > 0)
|
||||
if (nthreads > 1 || ngpus > 0)
|
||||
neighflag_qeq = HALFTHREAD;
|
||||
else
|
||||
neighflag_qeq = HALF;
|
||||
|
|
|
@ -32,7 +32,7 @@ class KokkosLMP : protected Pointers {
|
|||
int exchange_comm_on_host;
|
||||
int forward_comm_on_host;
|
||||
int reverse_comm_on_host;
|
||||
int num_threads,ngpu;
|
||||
int nthreads,ngpus;
|
||||
int numa;
|
||||
int auto_sync;
|
||||
int gpu_direct_flag;
|
||||
|
|
|
@ -362,7 +362,7 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
|
|||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
void NeighborKokkos::set_binsize_kokkos() {
|
||||
if (!binsizeflag && lmp->kokkos->ngpu > 0) {
|
||||
if (!binsizeflag && lmp->kokkos->ngpus > 0) {
|
||||
binsize_user = cutneighmax;
|
||||
binsizeflag = 1;
|
||||
}
|
||||
|
|
|
@ -310,12 +310,12 @@ void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
|
||||
#else // No atomics
|
||||
|
||||
num_threads = lmp->kokkos->num_threads;
|
||||
nthreads = lmp->kokkos->nthreads;
|
||||
int nmax = f.extent(0);
|
||||
if (nmax > t_f.extent(1)) {
|
||||
t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
|
||||
t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);
|
||||
t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",num_threads,nmax);
|
||||
t_f = t_f_array_thread("pair_exp6_rx:t_f",nthreads,nmax);
|
||||
t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",nthreads,nmax);
|
||||
t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",nthreads,nmax);
|
||||
}
|
||||
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroDupViews>(0,nmax),*this);
|
||||
|
@ -1642,7 +1642,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIG
|
|||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, const int &i) const {
|
||||
for (int n = 0; n < num_threads; n++) {
|
||||
for (int n = 0; n < nthreads; n++) {
|
||||
f(i,0) += t_f(n,i,0);
|
||||
f(i,1) += t_f(n,i,1);
|
||||
f(i,2) += t_f(n,i,2);
|
||||
|
@ -1654,7 +1654,7 @@ void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, con
|
|||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroDupViews, const int &i) const {
|
||||
for (int n = 0; n < num_threads; n++) {
|
||||
for (int n = 0; n < nthreads; n++) {
|
||||
t_f(n,i,0) = 0.0;
|
||||
t_f(n,i,1) = 0.0;
|
||||
t_f(n,i,2) = 0.0;
|
||||
|
|
|
@ -145,7 +145,7 @@ class PairExp6rxKokkos : public PairExp6rx {
|
|||
int eflag,vflag;
|
||||
int nlocal,newton_pair,neighflag;
|
||||
double special_lj[4];
|
||||
int num_threads,ntypes;
|
||||
int nthreads,ntypes;
|
||||
|
||||
typename AT::t_x_array_randomread x;
|
||||
typename AT::t_f_array f;
|
||||
|
|
|
@ -1656,7 +1656,7 @@ void PPPMKokkos<DeviceType>::make_rho()
|
|||
iy = nyhi_out-nylo_out + 1;
|
||||
|
||||
copymode = 1;
|
||||
Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->num_threads,1);
|
||||
Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->nthreads,1);
|
||||
Kokkos::parallel_for(config,*this);
|
||||
copymode = 0;
|
||||
#endif
|
||||
|
|
|
@ -25,7 +25,7 @@ using namespace LAMMPS_NS;
|
|||
RandPoolWrap::RandPoolWrap(int, LAMMPS *lmp) : Pointers(lmp)
|
||||
{
|
||||
random_thr = NULL;
|
||||
nthreads = lmp->kokkos->num_threads;
|
||||
nthreads = lmp->kokkos->nthreads;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
@ -59,7 +59,7 @@ void RandPoolWrap::init(RanMars* random, int seed)
|
|||
// allocate pool of RNGs
|
||||
// generate a random number generator instance for
|
||||
// all threads != 0. make sure we use unique seeds.
|
||||
nthreads = lmp->kokkos->num_threads;
|
||||
nthreads = lmp->kokkos->nthreads;
|
||||
random_thr = new RanMars*[nthreads];
|
||||
for (int tid = 1; tid < nthreads; ++tid) {
|
||||
random_thr[tid] = new RanMars(lmp, seed + comm->me
|
||||
|
|
|
@ -50,8 +50,8 @@ namespace LAMMPS_NS {
|
|||
class KokkosLMP {
|
||||
public:
|
||||
int kokkos_exists;
|
||||
int num_threads;
|
||||
int ngpu;
|
||||
int nthreads;
|
||||
int ngpus;
|
||||
int numa;
|
||||
|
||||
KokkosLMP(class LAMMPS *, int, char **) {kokkos_exists = 0;}
|
||||
|
|
|
@ -81,7 +81,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
|
|||
nthreads = 1;
|
||||
#ifdef _OPENMP
|
||||
if (lmp->kokkos) {
|
||||
nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
|
||||
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
|
||||
} else if (getenv("OMP_NUM_THREADS") == NULL) {
|
||||
nthreads = 1;
|
||||
if (me == 0)
|
||||
|
|
|
@ -176,9 +176,9 @@ void Finish::end(int flag)
|
|||
const char fmt2[] =
|
||||
"%.1f%% CPU use with %d MPI tasks x %d OpenMP threads\n";
|
||||
if (screen) fprintf(screen,fmt2,cpu_loop,nprocs,
|
||||
lmp->kokkos->num_threads);
|
||||
lmp->kokkos->nthreads);
|
||||
if (logfile) fprintf(logfile,fmt2,cpu_loop,nprocs,
|
||||
lmp->kokkos->num_threads);
|
||||
lmp->kokkos->nthreads);
|
||||
} else {
|
||||
#if defined(_OPENMP)
|
||||
const char fmt2[] =
|
||||
|
@ -579,7 +579,7 @@ void Finish::end(int flag)
|
|||
}
|
||||
#endif
|
||||
|
||||
if (lmp->kokkos && lmp->kokkos->ngpu > 0)
|
||||
if (lmp->kokkos && lmp->kokkos->ngpus > 0)
|
||||
if (const char* env_clb = getenv("CUDA_LAUNCH_BLOCKING"))
|
||||
if (!(strcmp(env_clb,"1") == 0)) {
|
||||
error->warning(FLERR,"Timing breakdown may not be accurate "
|
||||
|
|
Loading…
Reference in New Issue