Added support for symmetrized memory layouts for ui/duidrj for the CPU- and GPU-path SNAP Kokkos implementation, various perf optimizations for ComputeUi/ComputeFusedDeidrj

This commit is contained in:
Evan Weinberg 2020-08-12 16:15:06 -04:00
parent ac43f8f685
commit a5d27763e5
5 changed files with 530 additions and 298 deletions

View File

@ -1058,7 +1058,7 @@ struct alignas(2*sizeof(real)) SNAComplex
{
real re,im;
KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
SNAComplex() = default;
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
: re(re), im(static_cast<real>(0.)) { ; }
@ -1100,6 +1100,17 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAC
typedef SNAComplex<SNAreal> SNAcomplex;
// Cayley-Klein pack
// Can guarantee it's aligned to 2 complex
struct alignas(32) CayleyKleinPack {
SNAcomplex a, b;
SNAcomplex da[3], db[3];
SNAreal sfac;
SNAreal dsfacu[3];
};
#if defined(KOKKOS_ENABLE_CXX11)
#undef ISFINITE

View File

@ -50,6 +50,7 @@ struct TagPairSNAPComputeFusedDeidrj{};
// CPU backend only
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPTransformUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPComputeBiCPU{};
@ -104,7 +105,7 @@ public:
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
@ -134,15 +135,15 @@ public:
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;

View File

@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
EV_FLOAT ev;
int idxu_max = snaKK.idxu_max;
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
EV_FLOAT ev_tmp;
@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
}
{
// Expand ulisttot -> ulisttot_full
// Zero out ylist
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
}
//Compute bispectrum
if (quadraticflag || eflag) {
//ComputeZi
@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
}
//ZeroYi,ComputeYi
//ComputeYi
{
int vector_length = vector_length_default;
int team_size = team_size_default;
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
//ZeroYi
check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
//ComputeYi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
}
} else { // GPU
#ifdef LMP_KOKKOS_GPU
@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
int team_size = 4; // need to cap b/c of shared memory reqs
check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
// 2 is for double buffer
const int tile_size = (twojmax+1)*(twojmax+1);
const int tile_size = (twojmax+1)*(twojmax/2+1);
typedef Kokkos::View< SNAcomplex*,
Kokkos::DefaultExecutionSpace::scratch_memory_space,
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
//Transform data layout of ulisttot to AoSoA, zero ylist
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
//Transform data layout of ylist out of AoSoA
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
const int idxu_half_max = snaKK.idxu_half_max;
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
}
@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
}
}
#endif // KOKKOS_ENABLE_CUDA
#endif // LMP_KOKKOS_GPU
}
@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
if ( rsq < rnd_cutsq(itype,jtype) ) {
if (final) {
my_sna.rij(ii,offset,0) = dx;
my_sna.rij(ii,offset,1) = dy;
my_sna.rij(ii,offset,2) = dz;
#ifdef LMP_KOKKOS_GPU
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
d_wjelem[elem_j]);
} else {
#endif
my_sna.rij(ii,offset,0) = dx;
my_sna.rij(ii,offset,1) = dy;
my_sna.rij(ii,offset,2) = dz;
my_sna.wj(ii,offset) = d_wjelem[elem_j];
my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
#ifdef LMP_KOKKOS_GPU
}
#endif
my_sna.inside(ii,offset) = j;
my_sna.wj(ii,offset) = d_wjelem[elem_j];
my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
if (chemflag)
my_sna.element(ii,offset) = elem_j;
else
@ -704,27 +712,56 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (idxu >= my_sna.idxu_max) return;
if (j > twojmax) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++) {
const int jju_half = my_sna.idxu_half_block(j);
const int jju = my_sna.idxu_block(j);
const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
for (int mb = 0; 2*mb <= j; mb++) {
for (int ma = 0; ma <= j; ma++) {
// Extract top half
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
const int idxu_shift = mb * (j + 1) + ma;
const int idxu_half = jju_half + idxu_shift;
const int idxu = jju + idxu_shift;
my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);
// Store
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
// Also zero yi
my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
// Symmetric term
const int sign_factor = (((ma+mb)%2==0)?1:-1);
const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
if (sign_factor == 1) {
utot_im = -utot_im;
} else {
utot_re = -utot_re;
}
my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };
// No need to zero symmetrized ylist
//my_sna.ylist_pack_re(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
//my_sna.ylist_pack_im(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
}
}
}
}
template<class DeviceType>
@ -742,20 +779,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (idxu >= my_sna.idxu_max) return;
if (idxu_half >= my_sna.idxu_half_max) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++) {
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);
my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
}
}
@ -904,22 +941,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the quantum number
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
if (idx >= my_sna.idxu_max) return;
if (iatom >= chunk_size) return;
// Extract the atomic index
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
if (ii >= chunk_size) return;
if (j > twojmax) return;
if (chemflag)
for(int ielem = 0; ielem < nelements; ielem++)
my_sna.zero_yi_cpu(idx,ii,ielem);
else
my_sna.zero_yi_cpu(idx,ii,0);
int elem_count = chemflag ? nelements : 1;
// De-symmetrize ulisttot
for (int ielem = 0; ielem < elem_count; ielem++) {
const int jju_half = my_sna.idxu_half_block(j);
const int jju = my_sna.idxu_block(j);
for (int mb = 0; 2*mb <= j; mb++) {
for (int ma = 0; ma <= j; ma++) {
// Extract top half
const int idxu_shift = mb * (j + 1) + ma;
const int idxu_half = jju_half + idxu_shift;
const int idxu = jju + idxu_shift;
// Load ulist
auto utot = my_sna.ulisttot(idxu_half, ielem, iatom);
// Store
my_sna.ulisttot_full(idxu, ielem, iatom) = utot;
// Zero Yi
my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.};
// Symmetric term
const int sign_factor = (((ma+mb)%2==0)?1:-1);
const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
if (sign_factor == 1) {
utot.im = -utot.im;
} else {
utot.re = -utot.re;
}
my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot;
}
}
}
}
template<class DeviceType>

View File

@ -55,6 +55,8 @@ public:
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
typedef Kokkos::View<CayleyKleinPack**, DeviceType> t_sna_2ckp;
inline
SNAKokkos() {};
KOKKOS_INLINE_FUNCTION
@ -78,6 +80,9 @@ inline
// functions for bispectrum coefficients, GPU only
KOKKOS_INLINE_FUNCTION
void compute_cayley_klein(const int&, const int&, const double&, const double&,
const double&, const double&, const double&);
KOKKOS_INLINE_FUNCTION
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
@ -97,8 +102,6 @@ inline
KOKKOS_INLINE_FUNCTION
void compute_zi_cpu(const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi_cpu(int,
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
KOKKOS_INLINE_FUNCTION
@ -117,6 +120,8 @@ inline
double compute_sfac(double, double); // add_uarraytot, compute_duarray
KOKKOS_INLINE_FUNCTION
double compute_dsfac(double, double); // compute_duarray
KOKKOS_INLINE_FUNCTION
void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein
// efficient complex FMA
// efficient caxpy (i.e., y += a x)
@ -140,6 +145,9 @@ inline
//per sna class instance for OMP use
// Alternative to rij, wj, rcutij...
// just calculate everything up front
t_sna_2ckp cayleyklein;
// Per InFlight Particle
t_sna_3d rij;
@ -156,6 +164,7 @@ inline
t_sna_3d_ll blist;
t_sna_3c_ll ulisttot;
t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
t_sna_3c_ll zlist;
t_sna_3c_ll ulist;
@ -173,7 +182,7 @@ inline
t_sna_4d_ll ylist_pack_re; // split real,
t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
int idxcg_max, idxu_max, idxz_max, idxb_max;
int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;
// Chem snap counts
int nelements;
@ -188,7 +197,13 @@ private:
Kokkos::View<int*[10], DeviceType> idxz;
Kokkos::View<int*[3], DeviceType> idxb;
Kokkos::View<int***, DeviceType> idxcg_block;
public:
Kokkos::View<int*, DeviceType> idxu_block;
Kokkos::View<int*, DeviceType> idxu_half_block;
Kokkos::View<int*, DeviceType> idxu_cache_block;
private:
Kokkos::View<int***, DeviceType> idxz_block;
Kokkos::View<int***, DeviceType> idxb_block;

View File

@ -120,6 +120,36 @@ void SNAKokkos<DeviceType>::build_indexlist()
idxu_max = idxu_count;
Kokkos::deep_copy(idxu_block,h_idxu_block);
// index list for half uarray
idxu_half_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_half_block",jdim);
auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
int idxu_half_count = 0;
for(int j = 0; j <= twojmax; j++) {
h_idxu_half_block[j] = idxu_half_count;
for(int mb = 0; 2*mb <= j; mb++)
for(int ma = 0; ma <= j; ma++)
idxu_half_count++;
}
idxu_half_max = idxu_half_count;
Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
// index list for "cache" uarray
// this is the GPU scratch memory requirements
// applied the CPU structures
idxu_cache_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_cache_block",jdim);
auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
int idxu_cache_count = 0;
for(int j = 0; j <= twojmax; j++) {
h_idxu_cache_block[j] = idxu_cache_count;
for(int mb = 0; mb < ((j+3)/2); mb++)
for (int ma = 0; ma <= j; ma++)
idxu_cache_count++;
}
idxu_cache_max = idxu_cache_count;
Kokkos::deep_copy(idxu_cache_block, h_idxu_cache_block);
// index list for beta and B
int idxb_count = 0;
@ -201,15 +231,21 @@ void SNAKokkos<DeviceType>::build_indexlist()
h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
// ylist is "compressed" via symmetry in its
// contraction with dulist
const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma;
h_idxz(idxz_count,9) = jju_half;
const int jju = h_idxu_block[j] + (j+1)*mb + ma;
h_idxz(idxz_count,9) = jju;
// no longer needed
//const int jju = h_idxu_block[j] + (j+1)*mb + ma;
//h_idxz(idxz_count,9) = jju;
idxz_count++;
}
}
Kokkos::deep_copy(idxz,h_idxz);
Kokkos::deep_copy(idxz_block,h_idxz_block);
}
/* ---------------------------------------------------------------------- */
@ -230,44 +266,47 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
natom = newnatom;
nmax = newnmax;
rij = t_sna_3d("sna:rij",natom,nmax,3);
inside = t_sna_2i("sna:inside",natom,nmax);
wj = t_sna_2d("sna:wj",natom,nmax);
rcutij = t_sna_2d("sna:rcutij",natom,nmax);
element = t_sna_2i("sna:rcutij",natom,nmax);
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
inside = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:inside"),natom,nmax);
element = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
dedr = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:dedr"),natom,nmax,3);
#ifdef LMP_KOKKOS_GPU
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
// dummy allocation
ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1);
ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom);
ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom);
ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32);
ulist = t_sna_3c_ll("sna:ulist",1,1,1);
zlist = t_sna_3c_ll("sna:zlist",1,1,1);
zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32);
blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32);
ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32);
ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32);
dulist = t_sna_4c3_ll("sna:dulist",1,1,1);
cayleyklein = t_sna_2ckp(Kokkos::ViewAllocateWithoutInitializing("sna:cayleyklein"), natom, nmax);
ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1); // dummy allocation
ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1);
ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),idxu_half_max,nelements,natom);
ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),idxu_half_max,nelements,natom);
ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),32,idxu_max,nelements,(natom+32-1)/32);
ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),1,1,1);
zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),1,1,1);
zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),32,idxz_max,ndoubles,(natom+32-1)/32);
blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),32,idxb_max,ntriples,(natom+32-1)/32);
ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),32,idxu_half_max,nelements,(natom+32-1)/32);
ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),32,idxu_half_max,nelements,(natom+32-1)/32);
dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),1,1,1);
} else {
#endif
ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom);
ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1);
ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1);
ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1);
ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom);
zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1);
blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1);
ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1);
ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1);
dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax);
rij = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:rij"),natom,nmax,3);
wj = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:wj"),natom,nmax);
rcutij = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),idxu_half_max,nelements,natom);
ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_full"),idxu_max,nelements,natom);
ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),1,1,1);
ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),1,1,1);
ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),1,1,1,1);
ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),idxu_cache_max,natom,nmax);
zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),idxz_max,ndoubles,natom);
zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),1,1,1,1);
blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),1,1,1,1);
ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),1,1,1,1);
ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),1,1,1,1);
dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),idxu_cache_max,natom,nmax);
#ifdef LMP_KOKKOS_GPU
}
@ -278,20 +317,111 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
* GPU routines
* ----------------------------------------------------------------------*/
/* ----------------------------------------------------------------------
Precompute the Cayley-Klein parameters and the derivatives thereof.
This routine better exploits parallelism than the GPU ComputeUi and
ComputeFusedDeidrj, which are one warp per atom-neighbor pair.
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_cayley_klein(const int& iatom, const int& jnbor, const double& x, const double& y,
const double& z, const double& rcut, const double& wj_local)
{
//const double x = rij(iatom,jnbor,0);
//const double y = rij(iatom,jnbor,1);
//const double z = rij(iatom,jnbor,2);
const double rsq = x * x + y * y + z * z;
const double r = sqrt(rsq);
//const double rcut = rcutij(iatom, jnbor);
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
const double theta0 = (r - rmin0) * rscale0;
double sn, cs;
sincos(theta0, &sn, &cs);
const double z0 = r * cs / sn;
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
//const double wj_local = wj(iatom, jnbor);
double sfac, dsfac;
compute_s_dsfac(r, rcut, sfac, dsfac);
sfac *= wj_local;
dsfac *= wj_local;
const double rinv = 1.0 / r;
const double ux = x * rinv;
const double uy = y * rinv;
const double uz = z * rinv;
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { z0 * r0inv, -z * r0inv };
const SNAcomplex b = { r0inv * y, -r0inv * x };
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
const double dr0invx = dr0invdr * ux;
const double dr0invy = dr0invdr * uy;
const double dr0invz = dr0invdr * uz;
const double dz0x = dz0dr * ux;
const double dz0y = dz0dr * uy;
const double dz0z = dz0dr * uz;
const SNAcomplex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
const SNAcomplex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
const SNAcomplex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
const SNAcomplex dbx = { y * dr0invx, -x * dr0invx - r0inv };
const SNAcomplex dby = { y * dr0invy + r0inv, -x * dr0invy };
const SNAcomplex dbz = { y * dr0invz, -x * dr0invz };
const double dsfacux = dsfac * ux;
const double dsfacuy = dsfac * uy;
const double dsfacuz = dsfac * uz;
CayleyKleinPack ckp{};
ckp.a = a;
ckp.b = b;
ckp.da[0] = dax;
ckp.db[0] = dbx;
ckp.da[1] = day;
ckp.db[1] = dby;
ckp.da[2] = daz;
ckp.db[2] = dbz;
ckp.sfac = sfac;
ckp.dsfacu[0] = dsfacux;
ckp.dsfacu[1] = dsfacuy;
ckp.dsfacu[2] = dsfacuz;
// Yes, this breaks the standard mantra of using SoA
// instead of AoS, but it's net fine because of the
// one warp per atom/neighbor pair for the recursive
// polynomials. There's good L1 reuse, anyway.
cayleyklein(iatom, jnbor) = ckp;
}
/* ----------------------------------------------------------------------
Initialize ulisttot with self-energy terms.
Ulisttot uses a "half" data layout which takes
advantage of the symmetry of the Wigner U matrices.
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
{
for (int jelem = 0; jelem < nelements; jelem++) {
for (int j = 0; j <= twojmax; j++) {
const int jju = idxu_block(j);
const int jju_half = idxu_half_block(j);
// Only diagonal elements get initialized
// for (int m = 0; m < (j+1)*(j+1); m++)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
// Top half only: gets symmetrized by TransformUi
// for (int m = 0; m < (j+1)*(j/2+1); m++)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
[&] (const int m) {
const int jjup = jju + m;
const int jjup = jju_half + m;
// if m is on the "diagonal", initialize it with the self energy.
// Otherwise zero it out
@ -323,40 +453,27 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
// get shared memory offset
const int max_m_tile = (twojmax+1)*(twojmax+1);
const int max_m_tile = (twojmax+1)*(twojmax/2+1);
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * max_m_tile;
// double buffer
// shared memory double buffer
// Each level stores a (j+1) x ((j+3)/2) tile in preparation
// for the computation on the next level. This is the "cached"
// data layout, which also gets used on the CPU.
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
const double x = rij(iatom,jnbor,0);
const double y = rij(iatom,jnbor,1);
const double z = rij(iatom,jnbor,2);
// Each warp is accessing the same pack: take advantage of
// L1 reuse
const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
const SNAcomplex a = ckp.a;
const SNAcomplex b = ckp.b;
const double sfac = ckp.sfac;
const double wj_local = wj(iatom, jnbor);
const double rcut = rcutij(iatom, jnbor);
const int jelem = element(iatom, jnbor);
const double rsq = x * x + y * y + z * z;
const double r = sqrt(rsq);
const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
// theta0 = (r - rmin0) * rscale0;
const double cs = cos(theta0);
const double sn = sin(theta0);
const double z0 = r * cs / sn; // r / tan(theta0)
// Compute cutoff function
const double sfac = compute_sfac(r, rcut) * wj_local;
// compute Cayley-Klein parameters for unit quaternion,
// pack into complex number
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { r0inv * z0, -r0inv * z };
const SNAcomplex b = { r0inv * y, -r0inv * x };
// VMK Section 4.8.2
// All writes go to global memory and shared memory
@ -367,7 +484,8 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
});
for (int j = 1; j <= twojmax; j++) {
const int jju = idxu_block[j];
const int jju = idxu_half_block[j];
// fill in left side of matrix layer from previous layer
@ -377,8 +495,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
// for (int mb = 0; 2*mb <= j; mb++)
const int n_mb = j/2+1;
// the last (j / 2) can be avoided due to symmetry
const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
const int total_iters = n_ma * n_mb;
//for (int m = 0; m < total_iters; m++) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
@ -423,25 +540,22 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
// if j is even (-> physical j integer), last element maps to self, skip
//if (!(m == total_iters - 1 && j % 2 == 0)) {
if (m < total_iters - 1 || j % 2 == 1) {
const int sign_factor = (((ma+mb)%2==0)?1:-1);
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
const int sign_factor = (((ma+mb)%2==0)?1:-1);
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
if (sign_factor == 1) {
u_accum.im = -u_accum.im;
} else {
u_accum.re = -u_accum.re;
}
buf2[jju_shared_flip] = u_accum;
// split re, im to get fully coalesced atomic add
Kokkos::atomic_add(&(ulisttot_re(jjup_flip,jelem,iatom)), sfac * u_accum.re);
Kokkos::atomic_add(&(ulisttot_im(jjup_flip,jelem,iatom)), sfac * u_accum.im);
if (sign_factor == 1) {
u_accum.im = -u_accum.im;
} else {
u_accum.re = -u_accum.re;
}
if (j%2==1 && mb+1==n_mb) {
buf2[jju_shared_flip] = u_accum;
}
// symmetric part of ulisttot is generated in TransformUi
});
// In CUDA backend,
// ThreadVectorRange has a __syncwarp (appropriately masked for
@ -454,8 +568,11 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
}
}
/* ----------------------------------------------------------------------
compute Zi by summing over products of Ui, GPU version
compute Zi by summing over products of Ui,
AoSoA data layout to take advantage of coalescing, avoiding warp
divergence. GPU version
------------------------------------------------------------------------- */
template<class DeviceType>
@ -525,6 +642,8 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
/* ----------------------------------------------------------------------
compute Bi by summing conj(Ui)*Zi
AoSoA data layout to take advantage of coalescing, avoiding warp
divergence.
------------------------------------------------------------------------- */
template<class DeviceType>
@ -615,8 +734,9 @@ void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, con
/* ----------------------------------------------------------------------
compute Yi from Ui without storing Zi, looping over zlist indices,
GPU version
compute Yi from Ui without storing Zi, looping over zlist indices.
AoSoA data layout to take advantage of coalescing, avoiding warp
divergence. GPU version.
------------------------------------------------------------------------- */
template<class DeviceType>
@ -635,7 +755,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
const int mb2max = idxz(jjz, 6);
const int na = idxz(jjz, 7);
const int nb = idxz(jjz, 8);
const int jju = idxz(jjz, 9);
const int jju_half = idxz(jjz, 9);
const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
@ -677,8 +797,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
} // end loop over ib
if (bnorm_flag) {
ztmp_r /= j + 1;
ztmp_i /= j + 1;
ztmp_r /= (j + 1);
ztmp_i /= (j + 1);
}
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
@ -710,8 +830,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
betaj *= (j1 + 1) / (j + 1.0);
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i);
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_i);
} // end loop over elem3
} // end loop over elem2
} // end loop over elem1
@ -719,7 +839,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
/* ----------------------------------------------------------------------
Fused calculation of the derivative of Ui w.r.t. atom j
and of dEidRj. GPU only.
and accumulation into dEidRj. GPU only.
------------------------------------------------------------------------- */
template<class DeviceType>
@ -732,6 +852,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
const int scratch_shift = team_rank * max_m_tile;
const int jelem = element(iatom, jnbor);
// See notes on data layouts for shared memory caching
// in `compute_ui`.
// double buffer for ulist
SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
@ -740,50 +863,20 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
const double x = rij(iatom,jnbor,0);
const double y = rij(iatom,jnbor,1);
const double z = rij(iatom,jnbor,2);
const double rsq = x * x + y * y + z * z;
const double r = sqrt(rsq);
const double rcut = rcutij(iatom, jnbor);
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
const double theta0 = (r - rmin0) * rscale0;
const double cs = cos(theta0);
const double sn = sin(theta0);
const double z0 = r * cs / sn;
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
const double wj_local = wj(iatom, jnbor);
const double sfac = wj_local * compute_sfac(r, rcut);
const double dsfac = wj_local * compute_dsfac(r, rcut);
const double rinv = 1.0 / r;
// extract a single unit vector
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
// Compute Cayley-Klein parameters for unit quaternion
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { r0inv * z0, -r0inv * z };
const SNAcomplex b = { r0inv * y, -r0inv * x };
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
const double dr0inv = dr0invdr * u;
const double dz0 = dz0dr * u;
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
-x * dr0inv + (dir==0?-r0inv:0.) };
const SNAcomplex a = ckp.a;
const SNAcomplex b = ckp.b;
const SNAcomplex da = ckp.da[dir];
const SNAcomplex db = ckp.db[dir];
const double sfac = ckp.sfac;
const double dsfacu = ckp.dsfacu[dir]; // dsfac * u
// Accumulate the full contribution to dedr on the fly
const double du_prod = dsfac * u; // chain rule
const SNAcomplex y_local = ylist(0, jelem, iatom);
// Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
double dedr_full_sum = 0.5 * du_prod * y_local.re;
double dedr_full_sum = 0.5 * dsfacu * y_local.re;
// single has a warp barrier at the end
Kokkos::single(Kokkos::PerThread(team), [=]() {
@ -793,8 +886,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
});
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j];
int jjup = idxu_block[j-1];
int jju_half = idxu_half_block[j];
// flatten the loop over ma,mb
@ -812,11 +904,10 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
[&] (const int m, double& sum_tmp) {
// ma fast, mb slow
// Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
const int mb = m / n_ma;
const int ma = m - mb * n_ma;
const int jju_index = jju+m;
const int jju_half_index = jju_half+m;
// Load y_local, apply the symmetry scaling factor
// The "secret" of the shared memory optimization is it eliminates
@ -824,7 +915,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
// shared memory and otherwise always writing, making the kernel
// ultimately compute bound. We take advantage of that by adding
// some reads back in.
auto y_local = ylist(jju_index, jelem, iatom);
auto y_local = ylist(jju_half_index, jelem, iatom);
if (j % 2 == 0 && 2*mb == j) {
if (ma == mb) { y_local = 0.5*y_local; }
else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
@ -870,7 +961,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
dulist_buf2[jju_shared_idx] = du_accum;
// Directly accumulate deidrj into sum_tmp
const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
const SNAcomplex du_prod = (dsfacu * u_accum) + (sfac * du_accum);
sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
// copy left side to right side with inversion symmetry VMK 4.4(2)
@ -923,8 +1014,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
* ----------------------------------------------------------------------*/
/* ----------------------------------------------------------------------
* compute Ui by summing over neighbors j
* ------------------------------------------------------------------------- */
Ulisttot uses a "half" data layout which takes
advantage of the symmetry of the Wigner U matrices.
* ------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
@ -932,11 +1024,11 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
{
for (int jelem = 0; jelem < nelements; jelem++) {
for (int j = 0; j <= twojmax; j++) {
const int jju = idxu_block(j);
const int jju = idxu_half_block(j);
// Only diagonal elements get initialized
// for (int m = 0; m < (j+1)*(j+1); m++)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
// for (int m = 0; m < (j+1)*(j/2+1); m++)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
[&] (const int m) {
const int jjup = jju + m;
@ -956,6 +1048,8 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
/* ----------------------------------------------------------------------
compute Ui by summing over bispectrum components. CPU only.
See comments above compute_uarray_cpu and add_uarraytot for
data layout comments.
------------------------------------------------------------------------- */
template<class DeviceType>
@ -1026,10 +1120,10 @@ void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
int ma2 = ma2max;
int icga = ma1min * (j2 + 1) + ma2max;
for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
ma1++;
ma2--;
icga += j2;
@ -1098,8 +1192,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
const int jjz_index = jjz + mb * (j + 1) + ma;
if (2*mb == j) return;
sum +=
ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
},sumzu_temp); // end loop over ma, mb
sumzu += sumzu_temp;
@ -1113,8 +1207,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
sum +=
ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
},sumzu_temp); // end loop over ma
sumzu += sumzu_temp;
@ -1122,8 +1216,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
sumzu += 0.5*
(ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
(ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
} // end if jeven
Kokkos::single(Kokkos::PerThread(team), [&] () {
@ -1154,17 +1248,6 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
}
/* ----------------------------------------------------------------------
compute Yi from Ui without storing Zi, looping over zlist indices
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem)
{
ylist(idx,ielem,iatom) = {0.0, 0.0};
}
/* ----------------------------------------------------------------------
compute Yi from Ui without storing Zi, looping over zlist indices,
CPU version
@ -1188,7 +1271,7 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
const int mb2max = idxz(jjz, 6);
const int na = idxz(jjz, 7);
const int nb = idxz(jjz, 8);
const int jju = idxz(jjz, 9);
const int jju_half = idxz(jjz, 9);
const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
@ -1214,10 +1297,10 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
int icga = ma1min*(j2+1) + ma2max;
for (int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
ma1++;
ma2--;
icga += j2;
@ -1264,8 +1347,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
if (!bnorm_flag && j1 > j)
betaj *= (j1 + 1) / (j + 1.0);
Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i);
Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i);
} // end loop over elem3
} // end loop over elem2
} // end loop over elem1
@ -1274,6 +1357,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
/* ----------------------------------------------------------------------
calculate derivative of Ui w.r.t. atom j
see comments above compute_duarray_cpu for comments on the
data layout
------------------------------------------------------------------------- */
template<class DeviceType>
@ -1301,6 +1386,10 @@ void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy
/* ----------------------------------------------------------------------
compute dEidRj, CPU path only.
dulist takes advantage of a `cached` data layout, similar to the
shared memory layout for the GPU routines, which is efficient for
compressing the calculation in compute_duarray_cpu. That said,
dulist only uses the "half" data layout part of that structure.
------------------------------------------------------------------------- */
@ -1314,14 +1403,18 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
//for(int j = 0; j <= twojmax; j++) {
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j, t_scalar3<double>& sum_tmp) {
int jju = idxu_block[j];
int jju_half = idxu_half_block[j];
int jju_cache = idxu_cache_block[j];
for(int mb = 0; 2*mb < j; mb++)
for(int ma = 0; ma <= j; ma++) {
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
jju++;
sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
jju_half++; jju_cache++;
} //end loop over ma mb
// For j even, handle middle column
@ -1330,16 +1423,22 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
int mb = j/2;
for(int ma = 0; ma < mb; ma++) {
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
jju++;
sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
jju_half++; jju_cache++;
}
//int ma = mb;
sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5;
sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5;
sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5;
sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5;
sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5;
sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5;
} // end if jeven
},final_sum); // end loop over j
@ -1355,6 +1454,10 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
/* ----------------------------------------------------------------------
add Wigner U-functions for one neighbor to the total
ulist is in a "cached" data layout, which is a compressed layout
which still keeps the recursive calculation simple. On the other hand
`ulisttot` uses a "half" data layout, which fully takes advantage
of the symmetry of the Wigner U matrices.
------------------------------------------------------------------------- */
template<class DeviceType>
@ -1364,15 +1467,27 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
{
const double sfac = compute_sfac(r, rcut) * wj;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
[&] (const int& i) {
Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).re), sfac * ulist(i,iatom,jnbor).re);
Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).im), sfac * ulist(i,iatom,jnbor).im);
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j) {
int jju_half = idxu_half_block[j]; // index into ulisttot
int jju_cache = idxu_cache_block[j]; // index into ulist
int count = 0;
for (int mb = 0; 2*mb <= j; mb++) {
for (int ma = 0; ma <= j; ma++) {
Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re);
Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im);
count++;
}
}
});
}
/* ----------------------------------------------------------------------
compute Wigner U-functions for one neighbor
compute Wigner U-functions for one neighbor.
`ulisttot` uses a "cached" data layout, matching the amount of
information stored between layers via scratch memory on the GPU path
------------------------------------------------------------------------- */
template<class DeviceType>
@ -1399,8 +1514,8 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
ulist(0,iatom,jnbor).im = 0.0;
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j];
int jjup = idxu_block[j-1];
const int jju = idxu_cache_block[j];
const int jjup = idxu_cache_block[j-1];
// fill in left side of matrix layer from previous layer
@ -1434,37 +1549,37 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
(b_r * ulist(jjup_index,iatom,jnbor).im -
b_i * ulist(jjup_index,iatom,jnbor).re);
}
});
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
jju = idxu_block[j];
jjup = jju+(j+1)*(j+1)-1;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
[&] (const int& mb) {
// for (int mb = 0; 2*mb <= j; mb++) {
int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju+mb*(j+1)+ma;
const int jjup_index = jjup-mb*(j+1)-ma;
if (mapar == 1) {
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
} else {
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
// Only need to add one symmetrized row for convenience
// Symmetry gets "unfolded" in accumulating ulisttot
if (j%2==1 && mb==(j/2)) {
const int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju + mb*(j+1) + ma;
const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
if (mapar == 1) {
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
} else {
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
}
mapar = -mapar;
}
mapar = -mapar;
}
});
}
}
/* ----------------------------------------------------------------------
compute derivatives of Wigner U-functions for one neighbor
see comments in compute_uarray_cpu()
Uses same cached data layout of ulist
------------------------------------------------------------------------- */
template<class DeviceType>
@ -1524,8 +1639,8 @@ double r0inv;
dulist(0,iatom,jnbor,2).im = 0.0;
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j];
int jjup = idxu_block[j-1];
int jju = idxu_cache_block[j];
int jjup = idxu_cache_block[j-1];
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
[&] (const int& mb) {
//for (int mb = 0; 2*mb <= j; mb++) {
@ -1568,33 +1683,32 @@ double r0inv;
b_i * dulist(jjup_index,iatom,jnbor,k).re);
}
}
});
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
// Only need to add one symmetrized row for convenience
// Symmetry gets "unfolded" during the dedr accumulation
jju = idxu_block[j];
jjup = jju+(j+1)*(j+1)-1;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
[&] (const int& mb) {
// for (int mb = 0; 2*mb <= j; mb++) {
int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju+mb*(j+1)+ma;
const int jjup_index = jjup-mb*(j+1)-ma;
if (mapar == 1) {
for (int k = 0; k < 3; k++) {
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
}
} else {
for (int k = 0; k < 3; k++) {
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
if (j%2==1 && mb==(j/2)) {
const int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju+mb*(j+1)+ma;
const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
if (mapar == 1) {
for (int k = 0; k < 3; k++) {
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
}
} else {
for (int k = 0; k < 3; k++) {
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
}
}
mapar = -mapar;
}
mapar = -mapar;
}
});
}
@ -1605,8 +1719,11 @@ double r0inv;
sfac *= wj;
dsfac *= wj;
// Even though we fill out a full "cached" data layout above,
// we only need the "half" data for the accumulation into dedr.
// Thus we skip updating any unnecessary data.
for (int j = 0; j <= twojmax; j++) {
int jju = idxu_block[j];
int jju = idxu_cache_block[j];
for (int mb = 0; 2*mb <= j; mb++)
for (int ma = 0; ma <= j; ma++) {
dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
@ -1980,6 +2097,24 @@ double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
return 0.0;
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_s_dsfac(const double r, const double rcut, double& sfac, double& dsfac) {
if (switch_flag == 0) { sfac = 0.; dsfac = 0.; }
else if (switch_flag == 1) {
if (r <= rmin0) { sfac = 1.0; dsfac = 0.0; }
else if (r > rcut) { sfac = 0.; dsfac = 0.; }
else {
const double rcutfac = MY_PI / (rcut - rmin0);
double sn, cs;
sincos((r - rmin0) * rcutfac, &sn, &cs);
sfac = 0.5 * (cs + 1.0);
dsfac = -0.5 * sn * rcutfac;
}
} else { sfac = 0.; dsfac = 0.; }
}
/* ---------------------------------------------------------------------- */
// efficient complex FMA (i.e., y += a x)
@ -2031,30 +2166,31 @@ double SNAKokkos<DeviceType>::memory_usage()
#ifdef LMP_KOKKOS_GPU
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts
auto natom_pad = (natom+32-1)/32;
bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_re
bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_im
bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_re
bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_im
bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack
bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack
bytes += natom_pad * idxb_max * ntriples * sizeof(double); // blist_pack
bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_re
bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_im
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist
bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_re
bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_im
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
} else {
#endif
bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot
bytes += natom * nmax * idxu_cache_max * sizeof(double) * 2; // ulist
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ulisttot
bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist
bytes += natom * idxb_max * ntriples * sizeof(double); // blist
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2; // dulist
bytes += natom * nmax * idxu_cache_max * 3 * sizeof(double) * 2; // dulist
#ifdef LMP_KOKKOS_GPU
}
#endif
@ -2063,6 +2199,8 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block
bytes += jdim * sizeof(int); // idxu_block
bytes += jdim * sizeof(int); // idxu_half_block
bytes += jdim * sizeof(int); // idxu_cache_block
bytes += jdim * jdim * jdim * sizeof(int); // idxz_block
bytes += jdim * jdim * jdim * sizeof(int); // idxb_block