Merge pull request #1571 from stanmoore1/kk_snap_opt

Add optimized version of Kokkos SNAP potential
This commit is contained in:
Axel Kohlmeyer 2019-07-19 19:10:10 -04:00 committed by GitHub
commit d52540ea31
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 529 additions and 734 deletions

View File

@ -31,10 +31,17 @@ PairStyle(snap/kk/host,PairSNAPKokkos<LMPHostType>)
namespace LAMMPS_NS { namespace LAMMPS_NS {
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
struct TagPairSNAPCompute{}; struct TagPairSNAPComputeForce{};
struct TagPairSNAPBeta{}; struct TagPairSNAPBeta{};
struct TagPairSNAPBispectrum{}; struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeDeidrj{};
template<class DeviceType> template<class DeviceType>
class PairSNAPKokkos : public PairSNAP { class PairSNAPKokkos : public PairSNAP {
@ -56,18 +63,39 @@ public:
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<NEIGHFLAG,EVFLAG> >::member_type& team) const; void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const;
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const; void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const; void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPBispectrum,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBispectrum>::member_type& team) const;
template<int NEIGHFLAG> template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
@ -90,15 +118,10 @@ protected:
t_dbvec dbvec; t_dbvec dbvec;
SNAKokkos<DeviceType> snaKK; SNAKokkos<DeviceType> snaKK;
// How much parallelism to use within an interaction int inum,max_neighs,chunk_offset;
int vector_length,team_size;
int team_scratch_size;
int thread_scratch_size;
int eflag,vflag; int eflag,vflag;
void compute_beta();
void compute_bispectrum();
void allocate(); void allocate();
//void read_files(char *, char *); //void read_files(char *, char *);
/*template<class DeviceType> /*template<class DeviceType>
@ -131,6 +154,7 @@ inline double dist2(double* x,double* y);
Kokkos::View<F_FLOAT*, DeviceType> d_wjelem; // elements weights Kokkos::View<F_FLOAT*, DeviceType> d_wjelem; // elements weights
Kokkos::View<F_FLOAT**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients Kokkos::View<F_FLOAT**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<F_FLOAT**, DeviceType> d_beta; // betas for all atoms in list Kokkos::View<F_FLOAT**, DeviceType> d_beta; // betas for all atoms in list
Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum; // bispectrum components for all atoms in list Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum; // bispectrum components for all atoms in list

View File

@ -57,7 +57,6 @@ PairSNAPKokkos<DeviceType>::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp)
datamask_read = EMPTY_MASK; datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK; datamask_modify = EMPTY_MASK;
vector_length = 8;
k_cutsq = tdual_fparams("PairSNAPKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); k_cutsq = tdual_fparams("PairSNAPKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>(); auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq; rnd_cutsq = d_cutsq;
@ -164,7 +163,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
d_numneigh = k_list->d_numneigh; d_numneigh = k_list->d_numneigh;
d_neighbors = k_list->d_neighbors; d_neighbors = k_list->d_neighbors;
d_ilist = k_list->d_ilist; d_ilist = k_list->d_ilist;
int inum = list->inum; inum = list->inum;
need_dup = lmp->kokkos->need_dup<DeviceType>(); need_dup = lmp->kokkos->need_dup<DeviceType>();
if (need_dup) { if (need_dup) {
@ -181,70 +180,112 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
const int num_neighs = neighs_i.get_num_neighs(); const int num_neighs = neighs_i.get_num_neighs();
if (max_neighs<num_neighs) max_neighs = num_neighs; if (max_neighs<num_neighs) max_neighs = num_neighs;
}*/ }*/
int max_neighs = 0; max_neighs = 0;
Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Experimental::Max<int>(max_neighs)); Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Experimental::Max<int>(max_neighs));
snaKK.nmax = max_neighs; int vector_length = 1;
int ui_vector_length = 1;
team_scratch_size = snaKK.size_team_scratch_arrays(); int team_size = 1;
thread_scratch_size = snaKK.size_thread_scratch_arrays(); int yi_team_size = 1;
//printf("Sizes: %i %i\n",team_scratch_size/1024,thread_scratch_size/1024);
int team_size_max = Kokkos::TeamPolicy<DeviceType>::team_size_max(*this); int team_size_max = Kokkos::TeamPolicy<DeviceType>::team_size_max(*this);
vector_length = 8;
#ifdef KOKKOS_ENABLE_CUDA #ifdef KOKKOS_ENABLE_CUDA
team_size = 32;//max_neighs; team_size = 32;//max_neighs;
if (team_size*vector_length > team_size_max) if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length; team_size = team_size_max/vector_length;
#else
team_size = 1; yi_team_size = 256;
if (yi_team_size*vector_length > team_size_max)
yi_team_size = team_size_max/vector_length;
ui_vector_length = 8;
if (team_size*ui_vector_length > team_size_max)
team_size = team_size_max/ui_vector_length;
#endif #endif
if (beta_max < list->inum) { if (beta_max < inum) {
d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta", beta_max = inum;
list->inum,ncoeff); d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",inum,ncoeff);
d_bispectrum = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:bispectrum", d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
list->inum,ncoeff);
beta_max = list->inum;
} }
// compute dE_i/dB_i = beta_i for all i in list int chunk_size = MIN(2000,inum);
chunk_offset = 0;
if (quadraticflag || eflag) snaKK.grow_rij(chunk_size,max_neighs);
compute_bispectrum();
compute_beta();
EV_FLOAT ev; EV_FLOAT ev;
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
EV_FLOAT ev_tmp;
if (chunk_size > inum - chunk_offset)
chunk_size = inum - chunk_offset;
//ComputeNeigh
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
//PreUi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi> policy_preui(chunk_size,team_size,vector_length);
Kokkos::parallel_for("PreUi",policy_preui,*this);
//ComputeUi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,ui_vector_length);
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
//Compute bispectrum
if (quadraticflag || eflag) {
//ComputeZi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi> policy_zi(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeZi",policy_zi,*this);
//ComputeBi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeBi",policy_bi,*this);
}
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta> policy_beta(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ComputeYi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(chunk_size,yi_team_size,vector_length);
Kokkos::parallel_for("ComputeYi",policy_yi,*this);
//ComputeDuidrj
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((inum+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
//ComputeDeidrj
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((inum+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
//ComputeForce
if (eflag) { if (eflag) {
if (neighflag == HALF) { if (neighflag == HALF) {
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<HALF,1> > policy(inum,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_reduce(policy Kokkos::parallel_reduce(policy_force
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) ,*this,ev_tmp);
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this,ev);
} else if (neighflag == HALFTHREAD) { } else if (neighflag == HALFTHREAD) {
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<HALFTHREAD,1> > policy(inum,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_reduce(policy Kokkos::parallel_reduce(policy_force
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) ,*this,ev_tmp);
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this,ev);
} }
} else { } else {
if (neighflag == HALF) { if (neighflag == HALF) {
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<HALF,0> > policy(inum,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_for(policy Kokkos::parallel_for(policy_force
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this); ,*this);
} else if (neighflag == HALFTHREAD) { } else if (neighflag == HALFTHREAD) {
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<HALFTHREAD,0> > policy(inum,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_for(policy Kokkos::parallel_for(policy_force
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this); ,*this);
} }
} }
ev += ev_tmp;
chunk_offset += chunk_size;
} // end while
if (need_dup) if (need_dup)
Kokkos::Experimental::contribute(f, dup_f); Kokkos::Experimental::contribute(f, dup_f);
@ -284,32 +325,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
} }
} }
/* ----------------------------------------------------------------------
compute beta
------------------------------------------------------------------------- */
template<class DeviceType>
void PairSNAPKokkos<DeviceType>::compute_beta()
{
// TODO: use RangePolicy instead, or thread over ncoeff?
int inum = list->inum;
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta> policy(inum,team_size,vector_length);
Kokkos::parallel_for(policy
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this);
}
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const {
const int ii = team.league_rank(); // TODO: use RangePolicy instead, or thread over ncoeff?
const int i = d_ilist[ii]; int ii = team.league_rank();
const int i = d_ilist[ii + chunk_offset];
const int itype = type[i]; const int itype = type[i];
const int ielem = d_map[itype]; const int ielem = d_map[itype];
SNAKokkos<DeviceType> my_sna = snaKK;
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>> Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
d_coeffi(d_coeffelem,ielem,Kokkos::ALL); d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
@ -319,11 +347,11 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokk
if (quadraticflag) { if (quadraticflag) {
int k = ncoeff+1; int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) { for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
double bveci = d_bispectrum(ii,icoeff); double bveci = my_sna.blist(ii,icoeff);
d_beta(ii,icoeff) += d_coeffi[k]*bveci; d_beta(ii,icoeff) += d_coeffi[k]*bveci;
k++; k++;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
double bvecj = d_bispectrum(ii,jcoeff); double bvecj = my_sna.blist(ii,jcoeff);
d_beta(ii,icoeff) += d_coeffi[k]*bvecj; d_beta(ii,icoeff) += d_coeffi[k]*bvecj;
d_beta(ii,jcoeff) += d_coeffi[k]*bveci; d_beta(ii,jcoeff) += d_coeffi[k]*bveci;
k++; k++;
@ -332,105 +360,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokk
} }
} }
/* ----------------------------------------------------------------------
compute bispectrum
------------------------------------------------------------------------- */
template<class DeviceType>
void PairSNAPKokkos<DeviceType>::compute_bispectrum()
{
int inum = list->inum;
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBispectrum> policy(inum,team_size,vector_length);
Kokkos::parallel_for(policy
.set_scratch_size(1,Kokkos::PerThread(thread_scratch_size))
.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size))
,*this);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBispectrum,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBispectrum>::member_type& team) const {
const int ii = team.league_rank();
const int i = d_ilist[ii];
SNAKokkos<DeviceType> my_sna(snaKK,team);
const double xtmp = x(i,0);
const double ytmp = x(i,1);
const double ztmp = x(i,2);
const int itype = type[i];
const int ielem = d_map[itype];
const double radi = d_radelem[ielem];
const int num_neighs = d_numneigh[i];
// rij[][3] = displacements between atom I and those neighbors
// inside = indices of neighbors of I within cutoff
// wj = weights for neighbors of I within cutoff
// rcutij = cutoffs for neighbors of I within cutoff
// note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
int ninside = 0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,num_neighs),
[&] (const int jj, int& count) {
Kokkos::single(Kokkos::PerThread(team), [&] (){
T_INT j = d_neighbors(i,jj);
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
const int elem_j = d_map[jtype];
if ( rsq < rnd_cutsq(itype,jtype) )
count++;
});
},ninside);
if (team.team_rank() == 0)
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs),
[&] (const int jj, int& offset, bool final) {
//for (int jj = 0; jj < num_neighs; jj++) {
T_INT j = d_neighbors(i,jj);
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
const int elem_j = d_map[jtype];
if ( rsq < rnd_cutsq(itype,jtype) ) {
if (final) {
my_sna.rij(offset,0) = dx;
my_sna.rij(offset,1) = dy;
my_sna.rij(offset,2) = dz;
my_sna.inside[offset] = j;
my_sna.wj[offset] = d_wjelem[elem_j];
my_sna.rcutij[offset] = (radi + d_radelem[elem_j])*rcutfac;
}
offset++;
}
});
team.team_barrier();
// compute Ui, Zi, and Bi for atom I
my_sna.compute_ui(team,ninside);
team.team_barrier();
my_sna.compute_zi(team);
team.team_barrier();
my_sna.compute_bi(team);
team.team_barrier();
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
d_bispectrum(ii,icoeff) = my_sna.blist[icoeff];
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
allocate all arrays allocate all arrays
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -496,30 +425,21 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
Kokkos::deep_copy(d_coeffelem,h_coeffelem); Kokkos::deep_copy(d_coeffelem,h_coeffelem);
Kokkos::deep_copy(d_map,h_map); Kokkos::deep_copy(d_map,h_map);
// allocate memory for per OpenMP thread data which
// is wrapped into the sna class
snaKK = SNAKokkos<DeviceType>(rfac0,twojmax, snaKK = SNAKokkos<DeviceType>(rfac0,twojmax,
rmin0,switchflag,bzeroflag); rmin0,switchflag,bzeroflag);
snaKK.grow_rij(0); snaKK.grow_rij(0,0);
snaKK.init(); snaKK.init();
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPCompute<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const {
// The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial int ii = team.league_rank();
const int i = d_ilist[ii + chunk_offset];
auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); SNAKokkos<DeviceType> my_sna = snaKK;
auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();
const int ii = team.league_rank();
const int i = d_ilist[ii];
SNAKokkos<DeviceType> my_sna(snaKK,team);
const double xtmp = x(i,0); const double xtmp = x(i,0);
const double ytmp = x(i,1); const double ytmp = x(i,1);
const double ztmp = x(i,2); const double ztmp = x(i,2);
@ -553,6 +473,8 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
}); });
},ninside); },ninside);
d_ninside(ii) = ninside;
if (team.team_rank() == 0) if (team.team_rank() == 0)
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs),
[&] (const int jj, int& offset, bool final) { [&] (const int jj, int& offset, bool final) {
@ -568,43 +490,124 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
if ( rsq < rnd_cutsq(itype,jtype) ) { if ( rsq < rnd_cutsq(itype,jtype) ) {
if (final) { if (final) {
my_sna.rij(offset,0) = dx; my_sna.rij(ii,offset,0) = dx;
my_sna.rij(offset,1) = dy; my_sna.rij(ii,offset,1) = dy;
my_sna.rij(offset,2) = dz; my_sna.rij(ii,offset,2) = dz;
my_sna.inside[offset] = j; my_sna.inside(ii,offset) = j;
my_sna.wj[offset] = d_wjelem[elem_j]; my_sna.wj(ii,offset) = d_wjelem[elem_j];
my_sna.rcutij[offset] = (radi + d_radelem[elem_j])*rcutfac; my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
} }
offset++; offset++;
} }
}); });
team.team_barrier(); }
// compute Ui, Yi for atom I template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
int ii = team.league_rank();
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.pre_ui(team,ii);
}
my_sna.compute_ui(team,ninside); template<class DeviceType>
team.team_barrier(); KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// for neighbors of I within cutoff: // Extract the atom number
// compute Fij = dEi/dRj = -dEi/dRi int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((inum+team.team_size()-1)/team.team_size()));
// add to Fi, subtract from Fj if (ii >= inum) return;
my_sna.compute_yi(team,d_beta,ii); // Extract the neighbor number
team.team_barrier(); const int jj = team.league_rank() / ((inum+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>> my_sna.compute_ui(team,ii,jj);
d_coeffi(d_coeffelem,ielem,Kokkos::ALL); }
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const {
int ii = team.league_rank();
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_yi(team,ii,d_beta);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const {
int ii = team.league_rank();
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_zi(team,ii);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const {
int ii = team.league_rank();
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_bi(team,ii);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((inum+team.team_size()-1)/team.team_size()));
if (ii >= inum) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((inum+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_duidrj(team,ii,jj);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((inum+team.team_size()-1)/team.team_size()));
if (ii >= inum) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((inum+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_deidrj(team,ii,jj);
}
template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
// The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>();
int ii = team.league_rank();
const int i = d_ilist[ii + chunk_offset];
SNAKokkos<DeviceType> my_sna = snaKK;
const int ninside = d_ninside(ii);
Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside), Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside),
[&] (const int jj) { [&] (const int jj) {
//for (int jj = 0; jj < ninside; jj++) { int j = my_sna.inside(ii,jj);
int j = my_sna.inside[jj];
my_sna.compute_duidrj(team,&my_sna.rij(jj,0),
my_sna.wj[jj],my_sna.rcutij[jj],jj);
F_FLOAT fij[3]; F_FLOAT fij[3];
my_sna.compute_deidrj(team,fij); fij[0] = my_sna.dedr(ii,jj,0);
fij[1] = my_sna.dedr(ii,jj,1);
fij[2] = my_sna.dedr(ii,jj,2);
Kokkos::single(Kokkos::PerThread(team), [&] (){ Kokkos::single(Kokkos::PerThread(team), [&] (){
a_f(i,0) += fij[0]; a_f(i,0) += fij[0];
@ -620,8 +623,8 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
if (vflag_either) { if (vflag_either) {
v_tally_xyz<NEIGHFLAG>(ev,i,j, v_tally_xyz<NEIGHFLAG>(ev,i,j,
fij[0],fij[1],fij[2], fij[0],fij[1],fij[2],
-my_sna.rij(jj,0),-my_sna.rij(jj,1), -my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
-my_sna.rij(jj,2)); -my_sna.rij(ii,jj,2));
} }
} }
@ -633,6 +636,11 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
if (EVFLAG) { if (EVFLAG) {
if (eflag_either) { if (eflag_either) {
const int itype = type(i);
const int ielem = d_map[itype];
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
Kokkos::single(Kokkos::PerTeam(team), [&] () { Kokkos::single(Kokkos::PerTeam(team), [&] () {
// evdwl = energy of atom I, sum over coeffs_k * Bi_k // evdwl = energy of atom I, sum over coeffs_k * Bi_k
@ -644,17 +652,17 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
// linear contributions // linear contributions
for (int icoeff = 0; icoeff < ncoeff; icoeff++) for (int icoeff = 0; icoeff < ncoeff; icoeff++)
evdwl += d_coeffi[icoeff+1]*d_bispectrum(ii,icoeff); evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,icoeff);
// quadratic contributions // quadratic contributions
if (quadraticflag) { if (quadraticflag) {
int k = ncoeff+1; int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) { for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
double bveci = d_bispectrum(ii,icoeff); double bveci = my_sna.blist(ii,icoeff);
evdwl += 0.5*d_coeffi[k++]*bveci*bveci; evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
double bvecj = d_bispectrum(ii,jcoeff); double bvecj = my_sna.blist(ii,jcoeff);
evdwl += d_coeffi[k++]*bveci*bvecj; evdwl += d_coeffi[k++]*bveci*bvecj;
} }
} }
@ -671,9 +679,9 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG
template<class DeviceType> template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPCompute<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPCompute<NEIGHFLAG,EVFLAG> >::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
EV_FLOAT ev; EV_FLOAT ev;
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPCompute<NEIGHFLAG,EVFLAG>(), team, ev); this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), team, ev);
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
@ -738,4 +746,5 @@ double PairSNAPKokkos<DeviceType>::memory_usage()
bytes += snaKK.memory_usage(); bytes += snaKK.memory_usage();
return bytes; return bytes;
} }
} }

View File

@ -25,6 +25,9 @@
namespace LAMMPS_NS { namespace LAMMPS_NS {
typedef double SNAreal;
typedef struct { SNAreal re, im; } SNAcomplex;
struct SNAKK_ZINDICES { struct SNAKK_ZINDICES {
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
}; };
@ -39,12 +42,22 @@ class SNAKokkos {
public: public:
typedef Kokkos::View<int*, DeviceType> t_sna_1i; typedef Kokkos::View<int*, DeviceType> t_sna_1i;
typedef Kokkos::View<double*, DeviceType> t_sna_1d; typedef Kokkos::View<double*, DeviceType> t_sna_1d;
typedef Kokkos::View<double*, Kokkos::LayoutRight, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic; typedef Kokkos::View<double*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic;
typedef Kokkos::View<double**, Kokkos::LayoutRight, DeviceType> t_sna_2d; typedef Kokkos::View<int**, DeviceType> t_sna_2i;
typedef Kokkos::View<double***, Kokkos::LayoutRight, DeviceType> t_sna_3d; typedef Kokkos::View<double**, DeviceType> t_sna_2d;
typedef Kokkos::View<double***[3], Kokkos::LayoutRight, DeviceType> t_sna_4d; typedef Kokkos::View<double***, DeviceType> t_sna_3d;
typedef Kokkos::View<double**[3], Kokkos::LayoutRight, DeviceType> t_sna_3d3; typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
typedef Kokkos::View<double*****, Kokkos::LayoutRight, DeviceType> t_sna_5d; typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
typedef Kokkos::View<SNAcomplex*, DeviceType> t_sna_1c;
typedef Kokkos::View<SNAcomplex*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
typedef Kokkos::View<SNAcomplex**, DeviceType> t_sna_2c;
typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_cpu;
typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
inline inline
SNAKokkos() {}; SNAKokkos() {};
@ -63,36 +76,31 @@ inline
inline inline
void init(); // void init(); //
inline
T_INT size_team_scratch_arrays();
inline
T_INT size_thread_scratch_arrays();
double memory_usage(); double memory_usage();
int ncoeff; int ncoeff;
// functions for bispectrum coefficients // functions for bispectrum coefficients
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // ForceSNAP void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
const Kokkos::View<F_FLOAT**, DeviceType> &beta, const int ii); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // ForceSNAP void compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int,
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
// functions for derivatives // functions for derivatives
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double*, double, double, int); //ForceSNAP void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_dbidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); //ForceSNAP void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double *); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
double compute_sfac(double, double); // add_uarraytot, compute_duarray double compute_sfac(double, double); // add_uarraytot, compute_duarray
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
@ -109,29 +117,26 @@ inline
// Per InFlight Particle // Per InFlight Particle
t_sna_2d rij; t_sna_3d rij;
t_sna_1i inside; t_sna_2i inside;
t_sna_1d wj; t_sna_2d wj;
t_sna_1d rcutij; t_sna_2d rcutij;
int nmax; t_sna_3d dedr;
int natom, nmax;
void grow_rij(int); void grow_rij(int, int);
int twojmax, diagonalstyle; int twojmax, diagonalstyle;
// Per InFlight Particle
t_sna_1d blist;
t_sna_1d ulisttot_r, ulisttot_i;
t_sna_1d_atomic ulisttot_r_a, ulisttot_i_a;
t_sna_1d zlist_r, zlist_i;
t_sna_2d ulist_r_ij, ulist_i_ij;
// Per InFlight Interaction t_sna_2d blist;
t_sna_1d ulist_r, ulist_i; t_sna_2c_cpu ulisttot;
t_sna_1d_atomic ylist_r, ylist_i; t_sna_2c zlist;
t_sna_3c ulist;
t_sna_2c ylist;
// derivatives of data // derivatives of data
t_sna_2d dulist_r, dulist_i; t_sna_4c dulist;
t_sna_2d dblist;
private: private:
double rmin0, rfac0; double rmin0, rfac0;
@ -168,14 +173,14 @@ inline
inline inline
void init_rootpqarray(); // init() void init_rootpqarray(); // init()
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team); // compute_ui void zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // compute_ui
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double); // compute_ui void addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, double); // compute_ui
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double, double, double, int); // compute_ui void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double, double, double, double,
double, double); // compute_ui double, double); // compute_ui
inline inline
@ -184,9 +189,9 @@ inline
inline inline
int compute_ncoeff(); // SNAKokkos() int compute_ncoeff(); // SNAKokkos()
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double, // compute_duidrj double, double, double, // compute_duidrj
double, double, double, double, double, int); double, double, double, double, double);
// Sets the style for the switching function // Sets the style for the switching function
// 0 = none // 0 = none
@ -197,7 +202,7 @@ inline
double wself; double wself;
int bzero_flag; // 1 if bzero subtracted from barray int bzero_flag; // 1 if bzero subtracted from barray
Kokkos::View<double*, Kokkos::LayoutRight, DeviceType> bzero; // array of B values for isolated atoms Kokkos::View<double*, DeviceType> bzero; // array of B values for isolated atoms
}; };
} }

View File

@ -41,8 +41,6 @@ SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
ncoeff = compute_ncoeff(); ncoeff = compute_ncoeff();
//create_twojmax_arrays();
nmax = 0; nmax = 0;
build_indexlist(); build_indexlist();
@ -63,37 +61,6 @@ SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
} }
} }
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
SNAKokkos<DeviceType>::SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
wself = sna.wself;
rfac0 = sna.rfac0;
rmin0 = sna.rmin0;
switch_flag = sna.switch_flag;
bzero_flag = sna.bzero_flag;
twojmax = sna.twojmax;
ncoeff = sna.ncoeff;
nmax = sna.nmax;
idxz = sna.idxz;
idxb = sna.idxb;
idxcg_max = sna.idxcg_max;
idxu_max = sna.idxu_max;
idxz_max = sna.idxz_max;
idxb_max = sna.idxb_max;
idxcg_block = sna.idxcg_block;
idxu_block = sna.idxu_block;
idxz_block = sna.idxz_block;
idxb_block = sna.idxb_block;
cglist = sna.cglist;
rootpqarray = sna.rootpqarray;
bzero = sna.bzero;
create_team_scratch_arrays(team);
create_thread_scratch_arrays(team);
}
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
@ -245,18 +212,80 @@ void SNAKokkos<DeviceType>::init()
template<class DeviceType> template<class DeviceType>
inline inline
void SNAKokkos<DeviceType>::grow_rij(int newnmax) void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
{ {
if(newnmax <= nmax) return; if(newnatom <= natom && newnmax <= nmax) return;
natom = newnatom;
nmax = newnmax; nmax = newnmax;
rij = t_sna_3d("sna:rij",natom,nmax,3);
inside = t_sna_2i("sna:inside",natom,nmax);
wj = t_sna_2d("sna:wj",natom,nmax);
rcutij = t_sna_2d("sna:rcutij",natom,nmax);
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
blist = t_sna_2d("sna:blist",natom,idxb_max);
ulisttot = t_sna_2c_cpu("sna:ulisttot",natom,idxu_max);
zlist = t_sna_2c("sna:zlist",natom,idxz_max);
ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
ylist = t_sna_2c("sna:ylist",natom,idxu_max);
dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
} }
/* ----------------------------------------------------------------------
* compute Ui by summing over neighbors j
* ------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
{
if(team.team_rank() == 0) {
zero_uarraytot(team,iatom);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
addself_uarraytot(team,iatom,wself);
//});
}
team.team_barrier();
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute Ui by summing over neighbors j compute Ui by summing over neighbors j
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int jnum) void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
{
double rsq, r, x, y, z, z0, theta0;
// utot(j,ma,mb) = 0 for all j,ma,ma
// utot(j,ma,ma) = 1 for all j,ma
// for j in neighbors of i:
// compute r0 = (x,y,z,z0)
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
x = rij(iatom,jnbor,0);
y = rij(iatom,jnbor,1);
z = rij(iatom,jnbor,2);
rsq = x * x + y * y + z * z;
r = sqrt(rsq);
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
// theta0 = (r - rmin0) * rscale0;
z0 = r / tan(theta0);
compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor));
//});
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnum)
{ {
double rsq, r, x, y, z, z0, theta0; double rsq, r, x, y, z, z0, theta0;
@ -267,9 +296,9 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
if(team.team_rank() == 0) { if(team.team_rank() == 0) {
zero_uarraytot(team); zero_uarraytot(team,iatom);
//Kokkos::single(Kokkos::PerThread(team), [&] (){ //Kokkos::single(Kokkos::PerThread(team), [&] (){
addself_uarraytot(team,wself); addself_uarraytot(team,iatom,wself);
//}); //});
} }
team.team_barrier(); team.team_barrier();
@ -277,19 +306,19 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum), Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum),
[&] (const int& j) { [&] (const int& j) {
//for(int j = 0; j < jnum; j++) { //for(int j = 0; j < jnum; j++) {
x = rij(j,0); x = rij(iatom,j,0);
y = rij(j,1); y = rij(iatom,j,1);
z = rij(j,2); z = rij(iatom,j,2);
rsq = x * x + y * y + z * z; rsq = x * x + y * y + z * z;
r = sqrt(rsq); r = sqrt(rsq);
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0); theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,j) - rmin0);
// theta0 = (r - rmin0) * rscale0; // theta0 = (r - rmin0) * rscale0;
z0 = r / tan(theta0); z0 = r / tan(theta0);
compute_uarray(team,x, y, z, z0, r); compute_uarray(team, iatom, j, x, y, z, z0, r);
//Kokkos::single(Kokkos::PerThread(team), [&] (){ //Kokkos::single(Kokkos::PerThread(team), [&] (){
add_uarraytot(team,r, wj[j], rcutij[j], j); add_uarraytot(team, iatom, j, r, wj(iatom,j), rcutij(iatom,j));
//}); //});
}); });
@ -301,7 +330,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
{ {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
[&] (const int& jjz) { [&] (const int& jjz) {
@ -318,8 +347,8 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
zlist_r[jjz] = 0.0; zlist(iatom,jjz).re = 0.0;
zlist_i[jjz] = 0.0; zlist(iatom,jjz).im = 0.0;
int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max; int jju2 = idxu_block[j2] + (j2+1)*mb2max;
@ -329,24 +358,19 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT
double suma1_r = 0.0; double suma1_r = 0.0;
double suma1_i = 0.0; double suma1_i = 0.0;
const double* u1_r = ulisttot_r.data() + jju1;
const double* u1_i = ulisttot_i.data() + jju1;
const double* u2_r = ulisttot_r.data() + jju2;
const double* u2_i = ulisttot_i.data() + jju2;
int ma1 = ma1min; int ma1 = ma1min;
int ma2 = ma2max; int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max; int icga = ma1min*(j2+1) + ma2max;
for(int ia = 0; ia < na; ia++) { for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]); suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]); suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
ma1++; ma1++;
ma2--; ma2--;
icga += j2; icga += j2;
} // end loop over ia } // end loop over ia
zlist_r[jjz] += cgblock[icgb] * suma1_r; zlist(iatom,jjz).re += cgblock[icgb] * suma1_r;
zlist_i[jjz] += cgblock[icgb] * suma1_i; zlist(iatom,jjz).im += cgblock[icgb] * suma1_i;
jju1 += j1+1; jju1 += j1+1;
jju2 -= j2+1; jju2 -= j2+1;
@ -362,26 +386,22 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom,
const Kokkos::View<F_FLOAT**, DeviceType> &beta, const int ii) const Kokkos::View<F_FLOAT**, DeviceType> &beta)
{ {
double betaj; double betaj;
const int ii = iatom;
{ {
double* const ptr = ylist_r.data(); Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)),
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist_r.span()),
[&] (const int& i) { [&] (const int& i) {
ptr[i] = 0.0; ylist(iatom,i).re = 0.0;
}); ylist(iatom,i).im = 0.0;
}
{
double* const ptr = ylist_i.data();
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist_i.span()),
[&] (const int& i) {
ptr[i] = 0.0;
}); });
} }
//int flopsum = 0;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
[&] (const int& jjz) { [&] (const int& jjz) {
//for(int jjz = 0; jjz < idxz_max; jjz++) { //for(int jjz = 0; jjz < idxz_max; jjz++) {
@ -410,18 +430,15 @@ void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceT
double suma1_r = 0.0; double suma1_r = 0.0;
double suma1_i = 0.0; double suma1_i = 0.0;
const double* u1_r = ulisttot_r.data() + jju1;
const double* u1_i = ulisttot_i.data() + jju1;
const double* u2_r = ulisttot_r.data() + jju2;
const double* u2_i = ulisttot_i.data() + jju2;
int ma1 = ma1min; int ma1 = ma1min;
int ma2 = ma2max; int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max; int icga = ma1min*(j2+1) + ma2max;
for(int ia = 0; ia < na; ia++) { for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]); suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]); ma1++; suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
//flopsum += 10;
ma1++;
ma2--; ma2--;
icga += j2; icga += j2;
} // end loop over ia } // end loop over ia
@ -458,11 +475,13 @@ void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceT
} }
Kokkos::single(Kokkos::PerThread(team), [&] () { Kokkos::single(Kokkos::PerThread(team), [&] () {
ylist_r[jju] += betaj*ztmp_r; Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r);
ylist_i[jju] += betaj*ztmp_i; Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i);
}); });
}); // end loop over jjz }); // end loop over jjz
//printf("sum %i\n",flopsum);
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -471,7 +490,7 @@ void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceT
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double* dedr) void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
{ {
t_scalar3<double> sum; t_scalar3<double> sum;
@ -482,9 +501,9 @@ void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<Dev
for(int mb = 0; 2*mb < j; mb++) for(int mb = 0; 2*mb < j; mb++)
for(int ma = 0; ma <= j; ma++) { for(int ma = 0; ma <= j; ma++) {
sum_tmp.x += dulist_r(jju,0) * ylist_r[jju] + dulist_i(jju,0) * ylist_i[jju]; sum_tmp.x += dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im;
sum_tmp.y += dulist_r(jju,1) * ylist_r[jju] + dulist_i(jju,1) * ylist_i[jju]; sum_tmp.y += dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im;
sum_tmp.z += dulist_r(jju,2) * ylist_r[jju] + dulist_i(jju,2) * ylist_i[jju]; sum_tmp.z += dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im;
jju++; jju++;
} //end loop over ma mb } //end loop over ma mb
@ -494,24 +513,25 @@ void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<Dev
int mb = j/2; int mb = j/2;
for(int ma = 0; ma < mb; ma++) { for(int ma = 0; ma < mb; ma++) {
sum_tmp.x += dulist_r(jju,0) * ylist_r[jju] + dulist_i(jju,0) * ylist_i[jju]; sum_tmp.x += dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im;
sum_tmp.y += dulist_r(jju,1) * ylist_r[jju] + dulist_i(jju,1) * ylist_i[jju]; sum_tmp.y += dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im;
sum_tmp.z += dulist_r(jju,2) * ylist_r[jju] + dulist_i(jju,2) * ylist_i[jju]; sum_tmp.z += dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im;
jju++; jju++;
} }
//int ma = mb; //int ma = mb;
sum_tmp.x += (dulist_r(jju,0) * ylist_r[jju] + dulist_i(jju,0) * ylist_i[jju])*0.5; sum_tmp.x += (dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im)*0.5;
sum_tmp.y += (dulist_r(jju,1) * ylist_r[jju] + dulist_i(jju,1) * ylist_i[jju])*0.5; sum_tmp.y += (dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im)*0.5;
sum_tmp.z += (dulist_r(jju,2) * ylist_r[jju] + dulist_i(jju,2) * ylist_i[jju])*0.5; sum_tmp.z += (dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im)*0.5;
} // end if jeven } // end if jeven
},sum); // end loop over j },sum); // end loop over j
//}
Kokkos::single(Kokkos::PerThread(team), [&] () { Kokkos::single(Kokkos::PerThread(team), [&] () {
dedr[0] = sum.x*2.0; dedr(iatom,jnbor,0) = sum.x*2.0;
dedr[1] = sum.y*2.0; dedr(iatom,jnbor,1) = sum.y*2.0;
dedr[2] = sum.z*2.0; dedr(iatom,jnbor,2) = sum.z*2.0;
}); });
} }
@ -522,7 +542,7 @@ void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<Dev
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
{ {
// for j1 = 0,...,twojmax // for j1 = 0,...,twojmax
// for j2 = 0,twojmax // for j2 = 0,twojmax
@ -555,8 +575,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
const int jjz_index = jjz+mb*(j+1)+ma; const int jjz_index = jjz+mb*(j+1)+ma;
if (2*mb == j) return; if (2*mb == j) return;
sum += sum +=
ulisttot_r(jju_index) * zlist_r(jjz_index) + ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
ulisttot_i(jju_index) * zlist_i(jjz_index); ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im;
},sumzu_temp); // end loop over ma, mb },sumzu_temp); // end loop over ma, mb
sumzu += sumzu_temp; sumzu += sumzu_temp;
@ -570,8 +590,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
sum += sum +=
ulisttot_r(jju_index) * zlist_r(jjz_index) + ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
ulisttot_i(jju_index) * zlist_i(jjz_index); ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im;
},sumzu_temp); // end loop over ma },sumzu_temp); // end loop over ma
sumzu += sumzu_temp; sumzu += sumzu_temp;
@ -579,8 +599,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
sumzu += 0.5* sumzu += 0.5*
(ulisttot_r(jju_index) * zlist_r(jjz_index) + (ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
ulisttot_i(jju_index) * zlist_i(jjz_index)); ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im);
} // end if jeven } // end if jeven
Kokkos::single(Kokkos::PerThread(team), [&] () { Kokkos::single(Kokkos::PerThread(team), [&] () {
@ -591,229 +611,50 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
if (bzero_flag) if (bzero_flag)
sumzu -= bzero[j]; sumzu -= bzero[j];
blist(jjb) = sumzu; blist(iatom,jjb) = sumzu;
}); });
}); });
//} // end loop over j //} // end loop over j
//} // end loop over j1, j2 //} // end loop over j1, j2
} }
/* ----------------------------------------------------------------------
calculate derivative of Bi w.r.t. atom j
variant using indexlist for j1,j2,j
variant using symmetry relation
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_dbidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team)
{
// for j1 = 0,...,twojmax
// for j2 = 0,twojmax
// for j = |j1-j2|,Min(twojmax,j1+j2),2
// zdb = 0
// for mb = 0,...,jmid
// for ma = 0,...,j
// zdb +=
// Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
// dbdr(j1,j2,j) += 2*zdb
// zdb = 0
// for mb1 = 0,...,j1mid
// for ma1 = 0,...,j1
// zdb +=
// Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)
// dbdr(j1,j2,j) += 2*zdb*(j+1)/(j1+1)
// zdb = 0
// for mb2 = 0,...,j2mid
// for ma2 = 0,...,j2
// zdb +=
// Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
// dbdr(j1,j2,j) += 2*zdb*(j+1)/(j2+1)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxb_max),
[&] (const int& jjb) {
//for(int jjb = 0; jjb < idxb_max; jjb++) {
const int j1 = idxb[jjb].j1;
const int j2 = idxb[jjb].j2;
const int j = idxb[jjb].j;
// dbdr = dblist(jjb);
// dbdr[0] = 0.0;
// dbdr[1] = 0.0;
// dbdr[2] = 0.0;
t_scalar3<double> dbdr,sumzdu_r;
// Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
int jjz = idxz_block(j1,j2,j);
int jju = idxu_block[j];
for(int mb = 0; 2*mb < j; mb++)
for(int ma = 0; ma <= j; ma++) {
const int jju_index = jju+mb*(j+1)+ma;
const int jjz_index = jjz+mb*(j+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
} //end loop over ma mb
// For j even, handle middle column
if (j%2 == 0) {
int mb = j/2;
for(int ma = 0; ma <= mb; ma++) {
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
}
int ma = mb;
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
for(int k = 0; k < 3; k++) {
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz] + dulist_i(jju_index,0) * zlist_i[jjz_index])*0.5;
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz] + dulist_i(jju_index,1) * zlist_i[jjz_index])*0.5;
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz] + dulist_i(jju_index,2) * zlist_i[jjz_index])*0.5;
}
} // end if jeven
dbdr += 2.0*sumzdu_r;
// Sum over Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)
double j1fac = (j+1)/(j1+1.0);
jjz = idxz_block(j,j2,j1);
jju = idxu_block[j1];
sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0;
for(int mb = 0; 2*mb < j1; mb++)
for(int ma = 0; ma <= j1; ma++) {
const int jju_index = jju+mb*(j1+1)+ma;
const int jjz_index = jjz+mb*(j1+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
} //end loop over ma1 mb1
// For j1 even, handle middle column
if (j1%2 == 0) {
const int mb = j1/2;
for(int ma = 0; ma <= mb; ma++) {
const int jju_index = jju+(mb-1)*(j1+1)+(j1+1)+ma;
const int jjz_index = jjz+(mb-1)*(j1+1)+(j1+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
}
int ma = mb;
const int jju_index = jju+(mb-1)*(j1+1)+(j1+1)+ma;
const int jjz_index = jjz+(mb-1)*(j1+1)+(j1+1)+ma;
for(int k = 0; k < 3; k++) {
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz] + dulist_i(jju_index,0) * zlist_i[jjz_index])*0.5;
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz] + dulist_i(jju_index,1) * zlist_i[jjz_index])*0.5;
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz] + dulist_i(jju_index,2) * zlist_i[jjz_index])*0.5;
}
} // end if j1even
dbdr += 2.0*sumzdu_r*j1fac;
// Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
double j2fac = (j+1)/(j2+1.0);
jjz = idxz_block(j,j1,j2);
jju = idxu_block[j2];
sumzdu_r.x = 0.0; sumzdu_r.y = 0.0; sumzdu_r.z = 0.0;
for(int mb = 0; 2*mb < j2; mb++)
for(int ma = 0; ma <= j2; ma++) {
const int jju_index = jju+mb*(j2+1)+ma;
const int jjz_index = jjz+mb*(j2+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
} //end loop over ma2 mb2
// For j2 even, handle middle column
if (j2%2 == 0) {
const int mb = j2/2;
for(int ma = 0; ma <= mb; ma++) {
const int jju_index = jju+(mb-1)*(j2+1)+(j2+1)+ma;
const int jjz_index = jjz+(mb-1)*(j2+1)+(j2+1)+ma;
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]);
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]);
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]);
}
int ma = mb;
const int jju_index = jju+(mb-1)*(j2+1)+(j2+1)+ma;
const int jjz_index = jjz+(mb-1)*(j2+1)+(j2+1)+ma;
for(int k = 0; k < 3; k++) {
sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz] + dulist_i(jju_index,0) * zlist_i[jjz_index])*0.5;
sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz] + dulist_i(jju_index,1) * zlist_i[jjz_index])*0.5;
sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz] + dulist_i(jju_index,2) * zlist_i[jjz_index])*0.5;
}
} // end if j2even
dbdr += 2.0*sumzdu_r*j2fac;
dblist(jjb,0) = dbdr.x;
dblist(jjb,1) = dbdr.y;
dblist(jjb,2) = dbdr.z;
}); //end loop over j1 j2 j
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
calculate derivative of Ui w.r.t. atom j calculate derivative of Ui w.r.t. atom j
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
double* rij, double wj, double rcut, int jj)
{ {
double rsq, r, x, y, z, z0, theta0, cs, sn; double rsq, r, x, y, z, z0, theta0, cs, sn;
double dz0dr; double dz0dr;
x = rij[0]; x = rij(iatom,jnbor,0);
y = rij[1]; y = rij(iatom,jnbor,1);
z = rij[2]; z = rij(iatom,jnbor,2);
rsq = x * x + y * y + z * z; rsq = x * x + y * y + z * z;
r = sqrt(rsq); r = sqrt(rsq);
double rscale0 = rfac0 * MY_PI / (rcut - rmin0); double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
theta0 = (r - rmin0) * rscale0; theta0 = (r - rmin0) * rscale0;
cs = cos(theta0); cs = cos(theta0);
sn = sin(theta0); sn = sin(theta0);
z0 = r * cs / sn; z0 = r * cs / sn;
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
compute_duarray(team, x, y, z, z0, r, dz0dr, wj, rcut, jj); compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
} }
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
{ {
{ {
double* const ptr = ulisttot_r.data(); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot_r.span()),
[&] (const int& i) { [&] (const int& i) {
ptr[i] = 0.0; ulisttot(iatom,i).re = 0.0;
}); ulisttot(iatom,i).im = 0.0;
}
{
double* const ptr = ulisttot_i.data();
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot_i.span()),
[&] (const int& i) {
ptr[i] = 0.0;
}); });
} }
} }
@ -822,15 +663,15 @@ void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<Dev
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, double wself_in) void SNAKokkos<DeviceType>::addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, double wself_in)
{ {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j) { [&] (const int& j) {
//for (int j = 0; j <= twojmax; j++) //for (int j = 0; j <= twojmax; j++)
int jju = idxu_block[j]; int jju = idxu_block[j];
for (int ma = 0; ma <= j; ma++) { for (int ma = 0; ma <= j; ma++) {
ulisttot_r[jju] = wself_in; ulisttot(iatom,jju).re = wself_in;
ulisttot_i[jju] = 0.0; ulisttot(iatom,jju).im = 0.0;
jju += j+2; jju += j+2;
} }
}); });
@ -842,28 +683,15 @@ void SNAKokkos<DeviceType>::addself_uarraytot(const typename Kokkos::TeamPolicy<
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
double r, double wj, double rcut, int j) double r, double wj, double rcut)
{ {
const double sfac = compute_sfac(r, rcut) * wj; const double sfac = compute_sfac(r, rcut) * wj;
const double* const ptr_r = ulist_r.data(); Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
const double* const ptr_i = ulist_i.data();
double* const ptrtot_r = ulisttot_r.data();
double* const ptrtot_i = ulisttot_i.data();
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
ulist_r_j(ulist_r_ij,j,Kokkos::ALL);
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
ulist_i_j(ulist_i_ij,j,Kokkos::ALL);
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot_r.span()),
[&] (const int& i) { [&] (const int& i) {
Kokkos::atomic_add(ptrtot_r+i, sfac * ptr_r[i]); Kokkos::atomic_add(&(ulisttot(iatom,i).re), sfac * ulist(iatom,jnbor,i).re);
Kokkos::atomic_add(ptrtot_i+i, sfac * ptr_i[i]); Kokkos::atomic_add(&(ulisttot(iatom,i).im), sfac * ulist(iatom,jnbor,i).im);
ulist_r_j(i) = ulist_r(i);
ulist_i_j(i) = ulist_i(i);
}); });
} }
@ -873,7 +701,7 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
double x, double y, double z, double x, double y, double z,
double z0, double r) double z0, double r)
{ {
@ -891,8 +719,8 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
// VMK Section 4.8.2 // VMK Section 4.8.2
ulist_r[0] = 1.0; ulist(iatom,jnbor,0).re = 1.0;
ulist_i[0] = 0.0; ulist(iatom,jnbor,0).im = 0.0;
for (int j = 1; j <= twojmax; j++) { for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j]; int jju = idxu_block[j];
@ -904,31 +732,31 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
[&] (const int& mb) { [&] (const int& mb) {
//for (int mb = 0; 2*mb <= j; mb++) { //for (int mb = 0; 2*mb <= j; mb++) {
const int jju_index = jju+mb+mb*j; const int jju_index = jju+mb+mb*j;
ulist_r[jju_index] = 0.0; ulist(iatom,jnbor,jju_index).re = 0.0;
ulist_i[jju_index] = 0.0; ulist(iatom,jnbor,jju_index).im = 0.0;
for (int ma = 0; ma < j; ma++) { for (int ma = 0; ma < j; ma++) {
const int jju_index = jju+mb+mb*j+ma; const int jju_index = jju+mb+mb*j+ma;
const int jjup_index = jjup+mb*j+ma; const int jjup_index = jjup+mb*j+ma;
rootpq = rootpqarray(j - ma,j - mb); rootpq = rootpqarray(j - ma,j - mb);
ulist_r[jju_index] += ulist(iatom,jnbor,jju_index).re +=
rootpq * rootpq *
(a_r * ulist_r[jjup_index] + (a_r * ulist(iatom,jnbor,jjup_index).re +
a_i * ulist_i[jjup_index]); a_i * ulist(iatom,jnbor,jjup_index).im);
ulist_i[jju_index] += ulist(iatom,jnbor,jju_index).im +=
rootpq * rootpq *
(a_r * ulist_i[jjup_index] - (a_r * ulist(iatom,jnbor,jjup_index).im -
a_i * ulist_r[jjup_index]); a_i * ulist(iatom,jnbor,jjup_index).re);
rootpq = rootpqarray(ma + 1,j - mb); rootpq = rootpqarray(ma + 1,j - mb);
ulist_r[jju_index+1] = ulist(iatom,jnbor,jju_index+1).re =
-rootpq * -rootpq *
(b_r * ulist_r[jjup_index] + (b_r * ulist(iatom,jnbor,jjup_index).re +
b_i * ulist_i[jjup_index]); b_i * ulist(iatom,jnbor,jjup_index).im);
ulist_i[jju_index+1] = ulist(iatom,jnbor,jju_index+1).im =
-rootpq * -rootpq *
(b_r * ulist_i[jjup_index] - (b_r * ulist(iatom,jnbor,jjup_index).im -
b_i * ulist_r[jjup_index]); b_i * ulist(iatom,jnbor,jjup_index).re);
} }
}); });
@ -946,11 +774,11 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
const int jju_index = jju+mb*(j+1)+ma; const int jju_index = jju+mb*(j+1)+ma;
const int jjup_index = jjup-mb*(j+1)-ma; const int jjup_index = jjup-mb*(j+1)-ma;
if (mapar == 1) { if (mapar == 1) {
ulist_r[jjup_index] = ulist_r[jju_index]; ulist(iatom,jnbor,jjup_index).re = ulist(iatom,jnbor,jju_index).re;
ulist_i[jjup_index] = -ulist_i[jju_index]; ulist(iatom,jnbor,jjup_index).im = -ulist(iatom,jnbor,jju_index).im;
} else { } else {
ulist_r[jjup_index] = -ulist_r[jju_index]; ulist(iatom,jnbor,jjup_index).re = -ulist(iatom,jnbor,jju_index).re;
ulist_i[jjup_index] = ulist_i[jju_index]; ulist(iatom,jnbor,jjup_index).im = ulist(iatom,jnbor,jju_index).im;
} }
mapar = -mapar; mapar = -mapar;
} }
@ -965,10 +793,10 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
double x, double y, double z, double x, double y, double z,
double z0, double r, double dz0dr, double z0, double r, double dz0dr,
double wj, double rcut, int jj) double wj, double rcut)
{ {
double r0inv; double r0inv;
double a_r, a_i, b_r, b_i; double a_r, a_i, b_r, b_i;
@ -1012,17 +840,12 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
db_i[0] += -r0inv; db_i[0] += -r0inv;
db_r[1] += r0inv; db_r[1] += r0inv;
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>> dulist(iatom,jnbor,0,0).re = 0.0;
ulist_r(ulist_r_ij,jj,Kokkos::ALL); dulist(iatom,jnbor,0,1).re = 0.0;
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>> dulist(iatom,jnbor,0,2).re = 0.0;
ulist_i(ulist_i_ij,jj,Kokkos::ALL); dulist(iatom,jnbor,0,0).im = 0.0;
dulist(iatom,jnbor,0,1).im = 0.0;
dulist_r(0,0) = 0.0; dulist(iatom,jnbor,0,2).im = 0.0;
dulist_r(0,1) = 0.0;
dulist_r(0,2) = 0.0;
dulist_i(0,0) = 0.0;
dulist_i(0,1) = 0.0;
dulist_i(0,2) = 0.0;
for (int j = 1; j <= twojmax; j++) { for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j]; int jju = idxu_block[j];
@ -1031,42 +854,42 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
[&] (const int& mb) { [&] (const int& mb) {
//for (int mb = 0; 2*mb <= j; mb++) { //for (int mb = 0; 2*mb <= j; mb++) {
const int jju_index = jju+mb+mb*j; const int jju_index = jju+mb+mb*j;
dulist_r(jju_index,0) = 0.0; dulist(iatom,jnbor,jju_index,0).re = 0.0;
dulist_r(jju_index,1) = 0.0; dulist(iatom,jnbor,jju_index,1).re = 0.0;
dulist_r(jju_index,2) = 0.0; dulist(iatom,jnbor,jju_index,2).re = 0.0;
dulist_i(jju_index,0) = 0.0; dulist(iatom,jnbor,jju_index,0).im = 0.0;
dulist_i(jju_index,1) = 0.0; dulist(iatom,jnbor,jju_index,1).im = 0.0;
dulist_i(jju_index,2) = 0.0; dulist(iatom,jnbor,jju_index,2).im = 0.0;
for (int ma = 0; ma < j; ma++) { for (int ma = 0; ma < j; ma++) {
const int jju_index = jju+mb+mb*j+ma; const int jju_index = jju+mb+mb*j+ma;
const int jjup_index = jjup+mb*j+ma; const int jjup_index = jjup+mb*j+ma;
rootpq = rootpqarray(j - ma,j - mb); rootpq = rootpqarray(j - ma,j - mb);
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
dulist_r(jju_index,k) += dulist(iatom,jnbor,jju_index,k).re +=
rootpq * (da_r[k] * ulist_r[jjup_index] + rootpq * (da_r[k] * ulist(iatom,jnbor,jjup_index).re +
da_i[k] * ulist_i[jjup_index] + da_i[k] * ulist(iatom,jnbor,jjup_index).im +
a_r * dulist_r(jjup_index,k) + a_r * dulist(iatom,jnbor,jjup_index,k).re +
a_i * dulist_i(jjup_index,k)); a_i * dulist(iatom,jnbor,jjup_index,k).im);
dulist_i(jju_index,k) += dulist(iatom,jnbor,jju_index,k).im +=
rootpq * (da_r[k] * ulist_i[jjup_index] - rootpq * (da_r[k] * ulist(iatom,jnbor,jjup_index).im -
da_i[k] * ulist_r[jjup_index] + da_i[k] * ulist(iatom,jnbor,jjup_index).re +
a_r * dulist_i(jjup_index,k) - a_r * dulist(iatom,jnbor,jjup_index,k).im -
a_i * dulist_r(jjup_index,k)); a_i * dulist(iatom,jnbor,jjup_index,k).re);
} }
rootpq = rootpqarray(ma + 1,j - mb); rootpq = rootpqarray(ma + 1,j - mb);
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
dulist_r(jju_index+1,k) = dulist(iatom,jnbor,jju_index+1,k).re =
-rootpq * (db_r[k] * ulist_r[jjup_index] + -rootpq * (db_r[k] * ulist(iatom,jnbor,jjup_index).re +
db_i[k] * ulist_i[jjup_index] + db_i[k] * ulist(iatom,jnbor,jjup_index).im +
b_r * dulist_r(jjup_index,k) + b_r * dulist(iatom,jnbor,jjup_index,k).re +
b_i * dulist_i(jjup_index,k)); b_i * dulist(iatom,jnbor,jjup_index,k).im);
dulist_i(jju_index+1,k) = dulist(iatom,jnbor,jju_index+1,k).im =
-rootpq * (db_r[k] * ulist_i[jjup_index] - -rootpq * (db_r[k] * ulist(iatom,jnbor,jjup_index).im -
db_i[k] * ulist_r[jjup_index] + db_i[k] * ulist(iatom,jnbor,jjup_index).re +
b_r * dulist_i(jjup_index,k) - b_r * dulist(iatom,jnbor,jjup_index,k).im -
b_i * dulist_r(jjup_index,k)); b_i * dulist(iatom,jnbor,jjup_index,k).re);
} }
} }
}); });
@ -1086,13 +909,13 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
const int jjup_index = jjup-mb*(j+1)-ma; const int jjup_index = jjup-mb*(j+1)-ma;
if (mapar == 1) { if (mapar == 1) {
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
dulist_r(jjup_index,k) = dulist_r(jju_index,k); dulist(iatom,jnbor,jjup_index,k).re = dulist(iatom,jnbor,jju_index,k).re;
dulist_i(jjup_index,k) = -dulist_i(jju_index,k); dulist(iatom,jnbor,jjup_index,k).im = -dulist(iatom,jnbor,jju_index,k).im;
} }
} else { } else {
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
dulist_r(jjup_index,k) = -dulist_r(jju_index,k); dulist(iatom,jnbor,jjup_index,k).re = -dulist(iatom,jnbor,jju_index,k).re;
dulist_i(jjup_index,k) = dulist_i(jju_index,k); dulist(iatom,jnbor,jjup_index,k).im = dulist(iatom,jnbor,jju_index,k).im;
} }
} }
mapar = -mapar; mapar = -mapar;
@ -1110,89 +933,24 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
int jju = idxu_block[j]; int jju = idxu_block[j];
for (int mb = 0; 2*mb <= j; mb++) for (int mb = 0; 2*mb <= j; mb++)
for (int ma = 0; ma <= j; ma++) { for (int ma = 0; ma <= j; ma++) {
dulist_r(jju,0) = dsfac * ulist_r[jju] * ux + dulist(iatom,jnbor,jju,0).re = dsfac * ulist(iatom,jnbor,jju).re * ux +
sfac * dulist_r(jju,0); sfac * dulist(iatom,jnbor,jju,0).re;
dulist_i(jju,0) = dsfac * ulist_i[jju] * ux + dulist(iatom,jnbor,jju,0).im = dsfac * ulist(iatom,jnbor,jju).im * ux +
sfac * dulist_i(jju,0); sfac * dulist(iatom,jnbor,jju,0).im;
dulist_r(jju,1) = dsfac * ulist_r[jju] * uy + dulist(iatom,jnbor,jju,1).re = dsfac * ulist(iatom,jnbor,jju).re * uy +
sfac * dulist_r(jju,1); sfac * dulist(iatom,jnbor,jju,1).re;
dulist_i(jju,1) = dsfac * ulist_i[jju] * uy + dulist(iatom,jnbor,jju,1).im = dsfac * ulist(iatom,jnbor,jju).im * uy +
sfac * dulist_i(jju,1); sfac * dulist(iatom,jnbor,jju,1).im;
dulist_r(jju,2) = dsfac * ulist_r[jju] * uz + dulist(iatom,jnbor,jju,2).re = dsfac * ulist(iatom,jnbor,jju).re * uz +
sfac * dulist_r(jju,2); sfac * dulist(iatom,jnbor,jju,2).re;
dulist_i(jju,2) = dsfac * ulist_i[jju] * uz + dulist(iatom,jnbor,jju,2).im = dsfac * ulist(iatom,jnbor,jju).im * uz +
sfac * dulist_i(jju,2); sfac * dulist(iatom,jnbor,jju,2).im;
jju++; jju++;
} }
} }
} }
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::create_team_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team)
{
ulisttot_r_a = ulisttot_r = t_sna_1d(team.team_scratch(1),idxu_max);
ulisttot_i_a = ulisttot_i = t_sna_1d(team.team_scratch(1),idxu_max);
ylist_r = t_sna_1d(team.team_scratch(1),idxu_max);
ylist_i = t_sna_1d(team.team_scratch(1),idxu_max);
zlist_r = t_sna_1d(team.team_scratch(1),idxz_max);
zlist_i = t_sna_1d(team.team_scratch(1),idxz_max);
blist = t_sna_1d(team.team_scratch(1),idxb_max);
rij = t_sna_2d(team.team_scratch(1),nmax,3);
rcutij = t_sna_1d(team.team_scratch(1),nmax);
wj = t_sna_1d(team.team_scratch(1),nmax);
inside = t_sna_1i(team.team_scratch(1),nmax);
ulist_r_ij = t_sna_2d(team.team_scratch(1),nmax,idxu_max);
ulist_i_ij = t_sna_2d(team.team_scratch(1),nmax,idxu_max);
}
template<class DeviceType>
inline
T_INT SNAKokkos<DeviceType>::size_team_scratch_arrays() {
T_INT size = 0;
size += t_sna_1d::shmem_size(idxu_max)*2; // ulisttot
size += t_sna_1d::shmem_size(idxu_max)*2; // ylist
size += t_sna_1d::shmem_size(idxz_max)*2; // zlist
size += t_sna_1d::shmem_size(idxb_max); // blist
size += t_sna_2d::shmem_size(nmax,3); // rij
size += t_sna_1d::shmem_size(nmax); // rcutij
size += t_sna_1d::shmem_size(nmax); // wj
size += t_sna_1i::shmem_size(nmax); // inside
size += t_sna_2d::shmem_size(nmax,idxu_max)*2; // ulist_ij
return size;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::create_thread_scratch_arrays(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team)
{
dblist = t_sna_2d(team.thread_scratch(1),idxb_max,3);
ulist_r = t_sna_1d(team.thread_scratch(1),idxu_max);
ulist_i = t_sna_1d(team.thread_scratch(1),idxu_max);
dulist_r = t_sna_2d(team.thread_scratch(1),idxu_max,3);
dulist_i = t_sna_2d(team.thread_scratch(1),idxu_max,3);
}
template<class DeviceType>
inline
T_INT SNAKokkos<DeviceType>::size_thread_scratch_arrays() {
T_INT size = 0;
size += t_sna_2d::shmem_size(idxb_max,3); // dblist
size += t_sna_1d::shmem_size(idxu_max)*2; // ulist
size += t_sna_2d::shmem_size(idxu_max,3)*2; // dulist
return size;
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
factorial n, wrapper for precomputed table factorial n, wrapper for precomputed table
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -1558,14 +1316,13 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += jdimpq*jdimpq * sizeof(double); // pqarray
bytes += idxcg_max * sizeof(double); // cglist bytes += idxcg_max * sizeof(double); // cglist
bytes += idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * sizeof(double) * 2; // ulist
bytes += idxu_max * sizeof(double) * 2; // ulisttot bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot
bytes += idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist
bytes += idxz_max * sizeof(double) * 2; // zlist bytes += natom * idxz_max * sizeof(double) * 2; // zlist
bytes += idxb_max * sizeof(double); // blist bytes += natom * idxb_max * sizeof(double); // blist
bytes += idxb_max * 3 * sizeof(double); // dblist bytes += natom * idxu_max * sizeof(double) * 2; // ylist
bytes += idxu_max * sizeof(double) * 2; // ylist
bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block
bytes += jdim * sizeof(int); // idxu_block bytes += jdim * sizeof(int); // idxu_block
@ -1577,11 +1334,11 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += jdim * sizeof(double); // bzero bytes += jdim * sizeof(double); // bzero
bytes += nmax * 3 * sizeof(double); // rij bytes += natom * nmax * 3 * sizeof(double); // rij
bytes += nmax * sizeof(int); // inside bytes += natom * nmax * sizeof(int); // inside
bytes += nmax * sizeof(double); // wj bytes += natom * nmax * sizeof(double); // wj
bytes += nmax * sizeof(double); // rcutij bytes += natom * nmax * sizeof(double); // rcutij
bytes += nmax * idxu_max * sizeof(double) * 2; // ulist_ij bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij
return bytes; return bytes;
} }