Code reformat

This commit is contained in:
Stan Moore 2019-06-06 09:59:41 -06:00
parent 708052dc81
commit f7026491f1
2 changed files with 219 additions and 184 deletions

View File

@ -231,24 +231,22 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag)
int vector_length = 32;
int atoms_per_team = 4;
int num_teams = inum/atoms_per_team + (inum%atoms_per_team?1:0);
int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
Kokkos::TeamPolicy <DeviceType> policy(num_teams, atoms_per_team, vector_length);
if (neighflag == FULL){
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(this,
atoms_per_team,
vector_length);
Kokkos::parallel_for( policy, computeH_functor );
}else if (neighflag == HALF){
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(this,
atoms_per_team,
vector_length);
Kokkos::parallel_for( policy, computeH_functor );
}else {
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALFTHREAD> computeH_functor(this,
atoms_per_team,
vector_length);
Kokkos::parallel_for( policy, computeH_functor );
Kokkos::TeamPolicy<DeviceType> policy(num_teams, atoms_per_team,
vector_length);
if (neighflag == FULL) {
FixQEqReaxKokkosComputeHFunctor<DeviceType, FULL> computeH_functor(
this, atoms_per_team, vector_length);
Kokkos::parallel_for(policy, computeH_functor);
} else if (neighflag == HALF) {
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALF> computeH_functor(
this, atoms_per_team, vector_length);
Kokkos::parallel_for(policy, computeH_functor);
} else {
FixQEqReaxKokkosComputeHFunctor<DeviceType, HALFTHREAD> computeH_functor(
this, atoms_per_team, vector_length);
Kokkos::parallel_for(policy, computeH_functor);
}
// init_matvec
@ -403,180 +401,208 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i)
// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val
// d_jlist - contains the column index corresponding to each entry in d_val
template<class DeviceType>
template<int NEIGHFLAG>
void
FixQEqReaxKokkos<DeviceType>::compute_h_team(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team,
int atoms_per_team,
int vector_length) const{
// scratch space setup
Kokkos::View< int*, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_ilist(team.team_shmem(), atoms_per_team);
Kokkos::View< int*, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_numnbrs(team.team_shmem(), atoms_per_team);
Kokkos::View< int*, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_firstnbr(team.team_shmem(), atoms_per_team);
template <class DeviceType>
template <int NEIGHFLAG>
void FixQEqReaxKokkos<DeviceType>::compute_h_team(
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team,
int atoms_per_team, int vector_length) const {
Kokkos::View< int**, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_jtype(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View< int**, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_jlist(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View< F_FLOAT**, Kokkos::ScratchMemorySpace<DeviceType>, Kokkos::MemoryTraits<Kokkos::Unmanaged> > s_r(team.team_shmem(), atoms_per_team, vector_length);
// scratch space setup
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_ilist(team.team_shmem(), atoms_per_team);
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_numnbrs(team.team_shmem(), atoms_per_team);
Kokkos::View<int *, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_firstnbr(team.team_shmem(), atoms_per_team);
// team of threads work on atoms with index in [firstatom, lastatom)
int firstatom = team.league_rank() * atoms_per_team;
int lastatom = ( firstatom + atoms_per_team < inum ) ? ( firstatom + atoms_per_team ) : inum;
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_jtype(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View<int **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_jlist(team.team_shmem(), atoms_per_team, vector_length);
Kokkos::View<F_FLOAT **, Kokkos::ScratchMemorySpace<DeviceType>,
Kokkos::MemoryTraits<Kokkos::Unmanaged>>
s_r(team.team_shmem(), atoms_per_team, vector_length);
// kokkos-thread-0 is used to load info from global memory into scratch space
if(team.team_rank() == 0){
// team of threads work on atoms with index in [firstatom, lastatom)
int firstatom = team.league_rank() * atoms_per_team;
int lastatom =
(firstatom + atoms_per_team < inum) ? (firstatom + atoms_per_team) : inum;
// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team]
// calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
// upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])
// kokkos-thread-0 is used to load info from global memory into scratch space
if (team.team_rank() == 0) {
Kokkos::parallel_scan( Kokkos::ThreadVectorRange(team, atoms_per_team), [&](const int &idx, int &totalnbrs, bool final) {
int ii = firstatom + idx;
// copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team]
// copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team]
// calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the
// upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom])
if(ii < inum){
const int i = d_ilist[ii];
int jnum = d_numneigh[i];
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team),
[&](const int &idx, int &totalnbrs, bool final) {
int ii = firstatom + idx;
if(final){
s_ilist[idx] = i;
s_numnbrs[idx] = jnum;
s_firstnbr[idx] = totalnbrs;
}
totalnbrs += jnum;
}else{
s_numnbrs[idx] = 0;
}
});
}
if (ii < inum) {
const int i = d_ilist[ii];
int jnum = d_numneigh[i];
if (final) {
s_ilist[idx] = i;
s_numnbrs[idx] = jnum;
s_firstnbr[idx] = totalnbrs;
}
totalnbrs += jnum;
} else {
s_numnbrs[idx] = 0;
}
});
}
// barrier ensures that the data moved to scratch space is visible to all the threads of the corresponding team
team.team_barrier();
// barrier ensures that the data moved to scratch space is visible to all the
// threads of the corresponding team
team.team_barrier();
// calculate the global memory offset from where the H matrix values to be calculated by the current team will be stored in d_val
int team_firstnbr_idx = 0;
Kokkos::single (Kokkos::PerTeam (team), [=] (int &val) {
int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + s_numnbrs[lastatom - firstatom - 1];
val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
}, team_firstnbr_idx);
// calculate the global memory offset from where the H matrix values to be
// calculated by the current team will be stored in d_val
int team_firstnbr_idx = 0;
Kokkos::single(Kokkos::PerTeam(team),
[=](int &val) {
int totalnbrs = s_firstnbr[lastatom - firstatom - 1] +
s_numnbrs[lastatom - firstatom - 1];
val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs);
},
team_firstnbr_idx);
// map the H matrix computation of each atom to kokkos-thread (one atom per
// kokkos-thread) neighbor computation for each atom is assigned to vector
// lanes of the corresponding thread
Kokkos::parallel_for(
Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) {
int ii = firstatom + idx;
// map the H matrix computation of each atom to kokkos-thread (one atom per kokkos-thread)
// neighbor computation for each atom is assigned to vector lanes of the corresponding thread
Kokkos::parallel_for( Kokkos::TeamThreadRange(team, atoms_per_team), [&] (const int &idx) {
int ii = firstatom + idx;
if (ii < inum) {
const int i = s_ilist[idx];
if(ii < inum){
const int i = s_ilist[idx];
if (mask[i] & groupbit) {
const X_FLOAT xtmp = x(i, 0);
const X_FLOAT ytmp = x(i, 1);
const X_FLOAT ztmp = x(i, 2);
const int itype = type(i);
const tagint itag = tag(i);
const int jnum = s_numnbrs[idx];
if (mask[i] & groupbit) {
const X_FLOAT xtmp = x(i,0);
const X_FLOAT ytmp = x(i,1);
const X_FLOAT ztmp = x(i,2);
const int itype = type(i);
const tagint itag = tag(i);
const int jnum = s_numnbrs[idx];
// calculate the write-offset for atom-i's first neighbor
int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
Kokkos::single(Kokkos::PerThread(team),
[&]() { d_firstnbr[i] = atomi_firstnbr_idx; });
// calculate the write-offset for atom-i's first neighbor
int atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx];
Kokkos::single (Kokkos::PerThread (team), [&] () {
d_firstnbr[i] = atomi_firstnbr_idx;
});
// current # of neighbor atoms with non-zero electrostatic
// interaction coefficients with atom-i which represents the # of
// non-zero elements in row-i of H matrix
int atomi_nbrs_inH = 0;
// calculate H matrix values corresponding to atom-i where neighbors
// are processed in batches and the batch size is vector_length
for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) {
// current # of neighbor atoms with non-zero electrostatic interaction coefficients with atom-i
// which represents the # of non-zero elements in row-i of H matrix
int atomi_nbrs_inH = 0;
int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
// calculate H matrix values corresponding to atom-i where neighbors are processed in batches and the batch size is vector_length
for(int jj_start = 0; jj_start < jnum; jj_start += vector_length){
// count the # of neighbor atoms with non-zero electrostatic
// interaction coefficients with atom-i in the current batch
int atomi_nbrs_curbatch = 0;
int atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH;
// compute rsq, jtype, j and store in scratch space which is
// reused later
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(team, vector_length),
[&](const int &idx, int &m_fill) {
const int jj = jj_start + idx;
// count the # of neighbor atoms with non-zero electrostatic interaction coefficients with atom-i in the current batch
int atomi_nbrs_curbatch = 0;
// initialize: -1 represents no interaction with atom-j
// where j = d_neighbors(i,jj)
s_jlist(team.team_rank(), idx) = -1;
// compute rsq, jtype, j and store in scratch space which is reused later
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_length), [&](const int &idx, int &m_fill) {
const int jj = jj_start + idx;
if (jj < jnum) {
int j = d_neighbors(i, jj);
j &= NEIGHMASK;
const int jtype = type(j);
// initialize: -1 represents no interaction with atom-j where j = d_neighbors(i,jj)
s_jlist(team.team_rank(), idx) = -1;
const X_FLOAT delx = x(j, 0) - xtmp;
const X_FLOAT dely = x(j, 1) - ytmp;
const X_FLOAT delz = x(j, 2) - ztmp;
if(jj < jnum){
int j = d_neighbors(i,jj);
j &= NEIGHMASK;
const int jtype = type(j);
// valid nbr interaction
bool valid = true;
if (NEIGHFLAG != FULL) {
// skip half of the interactions
const tagint jtag = tag(j);
if (j >= nlocal) {
if (itag > jtag) {
if ((itag + jtag) % 2 == 0)
valid = false;
} else if (itag < jtag) {
if ((itag + jtag) % 2 == 1)
valid = false;
} else {
if (x(j, 2) < ztmp)
valid = false;
if (x(j, 2) == ztmp && x(j, 1) < ytmp)
valid = false;
if (x(j, 2) == ztmp && x(j, 1) == ytmp &&
x(j, 0) < xtmp)
valid = false;
}
}
}
const X_FLOAT delx = x(j,0) - xtmp;
const X_FLOAT dely = x(j,1) - ytmp;
const X_FLOAT delz = x(j,2) - ztmp;
const F_FLOAT rsq =
delx * delx + dely * dely + delz * delz;
if (rsq > cutsq)
valid = false;
// valid nbr interaction
bool valid = true;
if (NEIGHFLAG != FULL) {
// skip half of the interactions
const tagint jtag = tag(j);
if (j >= nlocal) {
if (itag > jtag) {
if ((itag+jtag) % 2 == 0)
valid = false;
} else if (itag < jtag) {
if ((itag+jtag) % 2 == 1)
valid = false;
} else {
if (x(j,2) < ztmp)
valid = false;
if (x(j,2) == ztmp && x(j,1) < ytmp)
valid = false;
if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp)
valid = false;
}
}
}
if (valid) {
s_jlist(team.team_rank(), idx) = j;
s_jtype(team.team_rank(), idx) = jtype;
s_r(team.team_rank(), idx) = sqrt(rsq);
m_fill++;
}
}
},
atomi_nbrs_curbatch);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if (rsq > cutsq)
valid = false;
// write non-zero entries of H to global memory
Kokkos::parallel_scan(
Kokkos::ThreadVectorRange(team, vector_length),
[&](const int &idx, int &m_fill, bool final) {
int j = s_jlist(team.team_rank(), idx);
if (final) {
if (j != -1) {
const int jtype = s_jtype(team.team_rank(), idx);
const F_FLOAT r = s_r(team.team_rank(), idx);
const F_FLOAT shldij = d_shield(itype, jtype);
if(valid){
s_jlist(team.team_rank(), idx) = j;
s_jtype(team.team_rank(), idx) = jtype;
s_r(team.team_rank(), idx) = sqrt(rsq);
m_fill++;
}
}
}, atomi_nbrs_curbatch);
d_jlist[atomi_nbr_writeIdx + m_fill] = j;
d_val[atomi_nbr_writeIdx + m_fill] =
calculate_H_k(r, shldij);
}
}
// write non-zero entries of H to global memory
Kokkos::parallel_scan( Kokkos::ThreadVectorRange(team, vector_length), [&](const int &idx, int &m_fill, bool final) {
int j = s_jlist(team.team_rank(), idx);
if(final){
if(j != -1){
const int jtype = s_jtype(team.team_rank(), idx);
const F_FLOAT r = s_r(team.team_rank(), idx);
const F_FLOAT shldij = d_shield(itype,jtype);
d_jlist[atomi_nbr_writeIdx + m_fill] = j;
d_val[atomi_nbr_writeIdx + m_fill] = calculate_H_k(r, shldij);
}
}
if(j !=-1){
m_fill++;
}
});
atomi_nbrs_inH += atomi_nbrs_curbatch;
}
Kokkos::single (Kokkos::PerThread (team), [&] () {
d_numnbrs[i] = atomi_nbrs_inH;
});
}
}
});
if (j != -1) {
m_fill++;
}
});
atomi_nbrs_inH += atomi_nbrs_curbatch;
}
Kokkos::single(Kokkos::PerThread(team),
[&]() { d_numnbrs[i] = atomi_nbrs_inH; });
}
}
});
}
/* ---------------------------------------------------------------------- */

View File

@ -259,32 +259,41 @@ struct FixQEqReaxKokkosMatVecFunctor {
};
template <class DeviceType, int NEIGHFLAG>
struct FixQEqReaxKokkosComputeHFunctor {
int atoms_per_team, vector_length;
typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
FixQEqReaxKokkos<DeviceType> c;
struct FixQEqReaxKokkosComputeHFunctor {
int atoms_per_team, vector_length;
typedef Kokkos::ScratchMemorySpace<DeviceType> scratch_space;
FixQEqReaxKokkos<DeviceType> c;
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType>* c_ptr,
int _atoms_per_team,
int _vector_length):
c(*c_ptr), atoms_per_team(_atoms_per_team), vector_length(_vector_length) {
c.cleanup_copy();
};
FixQEqReaxKokkosComputeHFunctor(FixQEqReaxKokkos<DeviceType> *c_ptr,
int _atoms_per_team, int _vector_length)
: c(*c_ptr), atoms_per_team(_atoms_per_team),
vector_length(_vector_length) {
c.cleanup_copy();
};
KOKKOS_INLINE_FUNCTION
void operator()(const typename Kokkos::TeamPolicy <DeviceType> ::member_type &team) const {
c.template compute_h_team<NEIGHFLAG> (team, atoms_per_team, vector_length);
}
KOKKOS_INLINE_FUNCTION
void operator()(
const typename Kokkos::TeamPolicy<DeviceType>::member_type &team) const {
c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
}
size_t team_shmem_size( int team_size ) const {
size_t shmem_size = Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) + // s_ilist
Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) + // s_numnbrs
Kokkos::View<int*, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team) + // s_firstnbr
Kokkos::View<int**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) + //s_jtype
Kokkos::View<int**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) + //s_j
Kokkos::View<F_FLOAT**, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team, vector_length) ; //s_r
return shmem_size;
}
size_t team_shmem_size(int team_size) const {
size_t shmem_size =
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_ilist
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_numnbrs
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_firstnbr
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
shmem_size(atoms_per_team, vector_length) + // s_jtype
Kokkos::View<int **, scratch_space, Kokkos::MemoryUnmanaged>::
shmem_size(atoms_per_team, vector_length) + // s_j
Kokkos::View<F_FLOAT **, scratch_space,
Kokkos::MemoryUnmanaged>::shmem_size(atoms_per_team,
vector_length); // s_r
return shmem_size;
}
};
template <class DeviceType>