forked from lijiext/lammps
Added support for symmetrized memory layouts for ui/duidrj for the CPU- and GPU-path SNAP Kokkos implementation, various perf optimizations for ComputeUi/ComputeFusedDeidrj
This commit is contained in:
parent
ac43f8f685
commit
a5d27763e5
|
@ -1058,7 +1058,7 @@ struct alignas(2*sizeof(real)) SNAComplex
|
|||
{
|
||||
real re,im;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
|
||||
SNAComplex() = default;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
|
||||
: re(re), im(static_cast<real>(0.)) { ; }
|
||||
|
@ -1100,6 +1100,17 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAC
|
|||
|
||||
typedef SNAComplex<SNAreal> SNAcomplex;
|
||||
|
||||
// Cayley-Klein pack
|
||||
// Can guarantee it's aligned to 2 complex
|
||||
struct alignas(32) CayleyKleinPack {
|
||||
|
||||
SNAcomplex a, b;
|
||||
SNAcomplex da[3], db[3];
|
||||
SNAreal sfac;
|
||||
SNAreal dsfacu[3];
|
||||
|
||||
};
|
||||
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CXX11)
|
||||
#undef ISFINITE
|
||||
|
|
|
@ -50,6 +50,7 @@ struct TagPairSNAPComputeFusedDeidrj{};
|
|||
// CPU backend only
|
||||
struct TagPairSNAPPreUiCPU{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPTransformUiCPU{};
|
||||
struct TagPairSNAPComputeZiCPU{};
|
||||
struct TagPairSNAPBetaCPU{};
|
||||
struct TagPairSNAPComputeBiCPU{};
|
||||
|
@ -104,7 +105,7 @@ public:
|
|||
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
|
||||
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
@ -134,15 +135,15 @@ public:
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
|
||||
|
||||
|
|
|
@ -206,8 +206,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
|
||||
EV_FLOAT ev;
|
||||
|
||||
int idxu_max = snaKK.idxu_max;
|
||||
|
||||
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
|
||||
|
||||
EV_FLOAT ev_tmp;
|
||||
|
@ -246,6 +244,13 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
|
||||
}
|
||||
|
||||
{
|
||||
// Expand ulisttot -> ulisttot_full
|
||||
// Zero out ylist
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size});
|
||||
Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
|
||||
}
|
||||
|
||||
//Compute bispectrum
|
||||
if (quadraticflag || eflag) {
|
||||
//ComputeZi
|
||||
|
@ -261,20 +266,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
|
||||
}
|
||||
|
||||
//ZeroYi,ComputeYi
|
||||
//ComputeYi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
|
||||
|
||||
//ZeroYi
|
||||
check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
|
||||
|
||||
//ComputeYi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
|
||||
|
@ -294,6 +291,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
|
||||
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
|
||||
}
|
||||
|
||||
} else { // GPU
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
|
@ -313,10 +311,10 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
int team_size = 4; // need to cap b/c of shared memory reqs
|
||||
check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
|
||||
|
||||
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
|
||||
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
|
||||
// 2 is for double buffer
|
||||
|
||||
const int tile_size = (twojmax+1)*(twojmax+1);
|
||||
const int tile_size = (twojmax+1)*(twojmax/2+1);
|
||||
typedef Kokkos::View< SNAcomplex*,
|
||||
Kokkos::DefaultExecutionSpace::scratch_memory_space,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
|
||||
|
@ -329,7 +327,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
|
||||
|
||||
//Transform data layout of ulisttot to AoSoA, zero ylist
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
|
||||
|
||||
}
|
||||
|
@ -367,7 +365,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
|
||||
//Transform data layout of ylist out of AoSoA
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
const int idxu_half_max = snaKK.idxu_half_max;
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
|
||||
|
||||
}
|
||||
|
@ -397,7 +396,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
|||
}
|
||||
}
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif // LMP_KOKKOS_GPU
|
||||
|
||||
}
|
||||
|
||||
|
@ -608,12 +607,21 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
|
|||
|
||||
if ( rsq < rnd_cutsq(itype,jtype) ) {
|
||||
if (final) {
|
||||
my_sna.rij(ii,offset,0) = dx;
|
||||
my_sna.rij(ii,offset,1) = dy;
|
||||
my_sna.rij(ii,offset,2) = dz;
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
|
||||
my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
|
||||
d_wjelem[elem_j]);
|
||||
} else {
|
||||
#endif
|
||||
my_sna.rij(ii,offset,0) = dx;
|
||||
my_sna.rij(ii,offset,1) = dy;
|
||||
my_sna.rij(ii,offset,2) = dz;
|
||||
my_sna.wj(ii,offset) = d_wjelem[elem_j];
|
||||
my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
}
|
||||
#endif
|
||||
my_sna.inside(ii,offset) = j;
|
||||
my_sna.wj(ii,offset) = d_wjelem[elem_j];
|
||||
my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
|
||||
if (chemflag)
|
||||
my_sna.element(ii,offset) = elem_j;
|
||||
else
|
||||
|
@ -704,27 +712,56 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
|
|||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxu >= my_sna.idxu_max) return;
|
||||
if (j > twojmax) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
const int jju_half = my_sna.idxu_half_block(j);
|
||||
const int jju = my_sna.idxu_block(j);
|
||||
|
||||
const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
|
||||
const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
|
||||
for (int mb = 0; 2*mb <= j; mb++) {
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
// Extract top half
|
||||
|
||||
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
|
||||
const int idxu_shift = mb * (j + 1) + ma;
|
||||
const int idxu_half = jju_half + idxu_shift;
|
||||
const int idxu = jju + idxu_shift;
|
||||
|
||||
my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
|
||||
my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
|
||||
auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
|
||||
auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);
|
||||
|
||||
// Store
|
||||
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
|
||||
|
||||
// Also zero yi
|
||||
my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
|
||||
my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
|
||||
|
||||
// Symmetric term
|
||||
const int sign_factor = (((ma+mb)%2==0)?1:-1);
|
||||
const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
|
||||
|
||||
if (sign_factor == 1) {
|
||||
utot_im = -utot_im;
|
||||
} else {
|
||||
utot_re = -utot_re;
|
||||
}
|
||||
|
||||
my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };
|
||||
|
||||
// No need to zero symmetrized ylist
|
||||
//my_sna.ylist_pack_re(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
|
||||
//my_sna.ylist_pack_im(iatom_mod, idxu_flip, ielem, iatom_div) = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -742,20 +779,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iato
|
|||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxu >= my_sna.idxu_max) return;
|
||||
if (idxu_half >= my_sna.idxu_half_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
|
||||
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
|
||||
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
|
||||
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);
|
||||
|
||||
my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
|
||||
my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -904,22 +941,52 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen
|
|||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the quantum number
|
||||
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
|
||||
if (idx >= my_sna.idxu_max) return;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
// Extract the atomic index
|
||||
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
|
||||
if (ii >= chunk_size) return;
|
||||
if (j > twojmax) return;
|
||||
|
||||
if (chemflag)
|
||||
for(int ielem = 0; ielem < nelements; ielem++)
|
||||
my_sna.zero_yi_cpu(idx,ii,ielem);
|
||||
else
|
||||
my_sna.zero_yi_cpu(idx,ii,0);
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
// De-symmetrize ulisttot
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
|
||||
const int jju_half = my_sna.idxu_half_block(j);
|
||||
const int jju = my_sna.idxu_block(j);
|
||||
|
||||
for (int mb = 0; 2*mb <= j; mb++) {
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
// Extract top half
|
||||
|
||||
const int idxu_shift = mb * (j + 1) + ma;
|
||||
const int idxu_half = jju_half + idxu_shift;
|
||||
const int idxu = jju + idxu_shift;
|
||||
|
||||
// Load ulist
|
||||
auto utot = my_sna.ulisttot(idxu_half, ielem, iatom);
|
||||
|
||||
// Store
|
||||
my_sna.ulisttot_full(idxu, ielem, iatom) = utot;
|
||||
|
||||
// Zero Yi
|
||||
my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.};
|
||||
|
||||
// Symmetric term
|
||||
const int sign_factor = (((ma+mb)%2==0)?1:-1);
|
||||
const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
|
||||
|
||||
if (sign_factor == 1) {
|
||||
utot.im = -utot.im;
|
||||
} else {
|
||||
utot.re = -utot.re;
|
||||
}
|
||||
|
||||
my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
|
|
|
@ -55,6 +55,8 @@ public:
|
|||
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
|
||||
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
|
||||
|
||||
typedef Kokkos::View<CayleyKleinPack**, DeviceType> t_sna_2ckp;
|
||||
|
||||
inline
|
||||
SNAKokkos() {};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -78,6 +80,9 @@ inline
|
|||
|
||||
// functions for bispectrum coefficients, GPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_cayley_klein(const int&, const int&, const double&, const double&,
|
||||
const double&, const double&, const double&);
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
|
||||
|
@ -97,8 +102,6 @@ inline
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi_cpu(const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi_cpu(int,
|
||||
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -117,6 +120,8 @@ inline
|
|||
double compute_sfac(double, double); // add_uarraytot, compute_duarray
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double compute_dsfac(double, double); // compute_duarray
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein
|
||||
|
||||
// efficient complex FMA
|
||||
// efficient caxpy (i.e., y += a x)
|
||||
|
@ -140,6 +145,9 @@ inline
|
|||
|
||||
//per sna class instance for OMP use
|
||||
|
||||
// Alternative to rij, wj, rcutij...
|
||||
// just calculate everything up front
|
||||
t_sna_2ckp cayleyklein;
|
||||
|
||||
// Per InFlight Particle
|
||||
t_sna_3d rij;
|
||||
|
@ -156,6 +164,7 @@ inline
|
|||
|
||||
t_sna_3d_ll blist;
|
||||
t_sna_3c_ll ulisttot;
|
||||
t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
|
||||
t_sna_3c_ll zlist;
|
||||
|
||||
t_sna_3c_ll ulist;
|
||||
|
@ -173,7 +182,7 @@ inline
|
|||
t_sna_4d_ll ylist_pack_re; // split real,
|
||||
t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
|
||||
|
||||
int idxcg_max, idxu_max, idxz_max, idxb_max;
|
||||
int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;
|
||||
|
||||
// Chem snap counts
|
||||
int nelements;
|
||||
|
@ -188,7 +197,13 @@ private:
|
|||
Kokkos::View<int*[10], DeviceType> idxz;
|
||||
Kokkos::View<int*[3], DeviceType> idxb;
|
||||
Kokkos::View<int***, DeviceType> idxcg_block;
|
||||
|
||||
public:
|
||||
Kokkos::View<int*, DeviceType> idxu_block;
|
||||
Kokkos::View<int*, DeviceType> idxu_half_block;
|
||||
Kokkos::View<int*, DeviceType> idxu_cache_block;
|
||||
|
||||
private:
|
||||
Kokkos::View<int***, DeviceType> idxz_block;
|
||||
Kokkos::View<int***, DeviceType> idxb_block;
|
||||
|
||||
|
|
|
@ -120,6 +120,36 @@ void SNAKokkos<DeviceType>::build_indexlist()
|
|||
idxu_max = idxu_count;
|
||||
Kokkos::deep_copy(idxu_block,h_idxu_block);
|
||||
|
||||
// index list for half uarray
|
||||
idxu_half_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_half_block",jdim);
|
||||
auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
|
||||
|
||||
int idxu_half_count = 0;
|
||||
for(int j = 0; j <= twojmax; j++) {
|
||||
h_idxu_half_block[j] = idxu_half_count;
|
||||
for(int mb = 0; 2*mb <= j; mb++)
|
||||
for(int ma = 0; ma <= j; ma++)
|
||||
idxu_half_count++;
|
||||
}
|
||||
idxu_half_max = idxu_half_count;
|
||||
Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
|
||||
|
||||
// index list for "cache" uarray
|
||||
// this is the GPU scratch memory requirements
|
||||
// applied the CPU structures
|
||||
idxu_cache_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_cache_block",jdim);
|
||||
auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
|
||||
|
||||
int idxu_cache_count = 0;
|
||||
for(int j = 0; j <= twojmax; j++) {
|
||||
h_idxu_cache_block[j] = idxu_cache_count;
|
||||
for(int mb = 0; mb < ((j+3)/2); mb++)
|
||||
for (int ma = 0; ma <= j; ma++)
|
||||
idxu_cache_count++;
|
||||
}
|
||||
idxu_cache_max = idxu_cache_count;
|
||||
Kokkos::deep_copy(idxu_cache_block, h_idxu_cache_block);
|
||||
|
||||
// index list for beta and B
|
||||
|
||||
int idxb_count = 0;
|
||||
|
@ -201,15 +231,21 @@ void SNAKokkos<DeviceType>::build_indexlist()
|
|||
h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;
|
||||
|
||||
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
||||
// ylist is "compressed" via symmetry in its
|
||||
// contraction with dulist
|
||||
const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma;
|
||||
h_idxz(idxz_count,9) = jju_half;
|
||||
|
||||
const int jju = h_idxu_block[j] + (j+1)*mb + ma;
|
||||
h_idxz(idxz_count,9) = jju;
|
||||
// no longer needed
|
||||
//const int jju = h_idxu_block[j] + (j+1)*mb + ma;
|
||||
//h_idxz(idxz_count,9) = jju;
|
||||
|
||||
idxz_count++;
|
||||
}
|
||||
}
|
||||
Kokkos::deep_copy(idxz,h_idxz);
|
||||
Kokkos::deep_copy(idxz_block,h_idxz_block);
|
||||
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
@ -230,44 +266,47 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
|
|||
natom = newnatom;
|
||||
nmax = newnmax;
|
||||
|
||||
rij = t_sna_3d("sna:rij",natom,nmax,3);
|
||||
inside = t_sna_2i("sna:inside",natom,nmax);
|
||||
wj = t_sna_2d("sna:wj",natom,nmax);
|
||||
rcutij = t_sna_2d("sna:rcutij",natom,nmax);
|
||||
element = t_sna_2i("sna:rcutij",natom,nmax);
|
||||
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
|
||||
inside = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:inside"),natom,nmax);
|
||||
element = t_sna_2i(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
|
||||
dedr = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:dedr"),natom,nmax,3);
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
|
||||
// dummy allocation
|
||||
ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1);
|
||||
ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom);
|
||||
ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom);
|
||||
ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32);
|
||||
ulist = t_sna_3c_ll("sna:ulist",1,1,1);
|
||||
zlist = t_sna_3c_ll("sna:zlist",1,1,1);
|
||||
zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32);
|
||||
blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
|
||||
blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32);
|
||||
ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
|
||||
ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32);
|
||||
ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32);
|
||||
dulist = t_sna_4c3_ll("sna:dulist",1,1,1);
|
||||
|
||||
cayleyklein = t_sna_2ckp(Kokkos::ViewAllocateWithoutInitializing("sna:cayleyklein"), natom, nmax);
|
||||
ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1); // dummy allocation
|
||||
ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),1,1,1);
|
||||
ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),idxu_half_max,nelements,natom);
|
||||
ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),idxu_half_max,nelements,natom);
|
||||
ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),32,idxu_max,nelements,(natom+32-1)/32);
|
||||
ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),1,1,1);
|
||||
zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),1,1,1);
|
||||
zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),32,idxz_max,ndoubles,(natom+32-1)/32);
|
||||
blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
|
||||
blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),32,idxb_max,ntriples,(natom+32-1)/32);
|
||||
ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
|
||||
ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),32,idxu_half_max,nelements,(natom+32-1)/32);
|
||||
ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),32,idxu_half_max,nelements,(natom+32-1)/32);
|
||||
dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),1,1,1);
|
||||
} else {
|
||||
#endif
|
||||
ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom);
|
||||
ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1);
|
||||
ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1);
|
||||
ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1);
|
||||
ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
|
||||
zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom);
|
||||
zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1);
|
||||
blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
|
||||
blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1);
|
||||
ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
|
||||
ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1);
|
||||
ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1);
|
||||
dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax);
|
||||
rij = t_sna_3d(Kokkos::ViewAllocateWithoutInitializing("sna:rij"),natom,nmax,3);
|
||||
wj = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:wj"),natom,nmax);
|
||||
rcutij = t_sna_2d(Kokkos::ViewAllocateWithoutInitializing("sna:rcutij"),natom,nmax);
|
||||
ulisttot = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot"),idxu_half_max,nelements,natom);
|
||||
ulisttot_full = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_full"),idxu_max,nelements,natom);
|
||||
ulisttot_re = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_re"),1,1,1);
|
||||
ulisttot_im = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_im"),1,1,1);
|
||||
ulisttot_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulisttot_pack"),1,1,1,1);
|
||||
ulist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ulist"),idxu_cache_max,natom,nmax);
|
||||
zlist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist"),idxz_max,ndoubles,natom);
|
||||
zlist_pack = t_sna_4c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:zlist_pack"),1,1,1,1);
|
||||
blist = t_sna_3d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist"),idxb_max,ntriples,natom);
|
||||
blist_pack = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:blist_pack"),1,1,1,1);
|
||||
ylist = t_sna_3c_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist"),idxu_half_max,nelements,natom);
|
||||
ylist_pack_re = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_re"),1,1,1,1);
|
||||
ylist_pack_im = t_sna_4d_ll(Kokkos::ViewAllocateWithoutInitializing("sna:ylist_pack_im"),1,1,1,1);
|
||||
dulist = t_sna_4c3_ll(Kokkos::ViewAllocateWithoutInitializing("sna:dulist"),idxu_cache_max,natom,nmax);
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
}
|
||||
|
@ -278,20 +317,111 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
|
|||
* GPU routines
|
||||
* ----------------------------------------------------------------------*/
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Precompute the Cayley-Klein parameters and the derivatives thereof.
|
||||
This routine better exploits parallelism than the GPU ComputeUi and
|
||||
ComputeFusedDeidrj, which are one warp per atom-neighbor pair.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_cayley_klein(const int& iatom, const int& jnbor, const double& x, const double& y,
|
||||
const double& z, const double& rcut, const double& wj_local)
|
||||
{
|
||||
//const double x = rij(iatom,jnbor,0);
|
||||
//const double y = rij(iatom,jnbor,1);
|
||||
//const double z = rij(iatom,jnbor,2);
|
||||
const double rsq = x * x + y * y + z * z;
|
||||
const double r = sqrt(rsq);
|
||||
//const double rcut = rcutij(iatom, jnbor);
|
||||
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
|
||||
const double theta0 = (r - rmin0) * rscale0;
|
||||
double sn, cs;
|
||||
sincos(theta0, &sn, &cs);
|
||||
const double z0 = r * cs / sn;
|
||||
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
||||
|
||||
//const double wj_local = wj(iatom, jnbor);
|
||||
double sfac, dsfac;
|
||||
compute_s_dsfac(r, rcut, sfac, dsfac);
|
||||
sfac *= wj_local;
|
||||
dsfac *= wj_local;
|
||||
|
||||
const double rinv = 1.0 / r;
|
||||
const double ux = x * rinv;
|
||||
const double uy = y * rinv;
|
||||
const double uz = z * rinv;
|
||||
|
||||
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
|
||||
const SNAcomplex a = { z0 * r0inv, -z * r0inv };
|
||||
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
|
||||
const double dr0invx = dr0invdr * ux;
|
||||
const double dr0invy = dr0invdr * uy;
|
||||
const double dr0invz = dr0invdr * uz;
|
||||
|
||||
const double dz0x = dz0dr * ux;
|
||||
const double dz0y = dz0dr * uy;
|
||||
const double dz0z = dz0dr * uz;
|
||||
|
||||
const SNAcomplex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
|
||||
const SNAcomplex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
|
||||
const SNAcomplex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
|
||||
|
||||
const SNAcomplex dbx = { y * dr0invx, -x * dr0invx - r0inv };
|
||||
const SNAcomplex dby = { y * dr0invy + r0inv, -x * dr0invy };
|
||||
const SNAcomplex dbz = { y * dr0invz, -x * dr0invz };
|
||||
|
||||
const double dsfacux = dsfac * ux;
|
||||
const double dsfacuy = dsfac * uy;
|
||||
const double dsfacuz = dsfac * uz;
|
||||
|
||||
CayleyKleinPack ckp{};
|
||||
ckp.a = a;
|
||||
ckp.b = b;
|
||||
ckp.da[0] = dax;
|
||||
ckp.db[0] = dbx;
|
||||
ckp.da[1] = day;
|
||||
ckp.db[1] = dby;
|
||||
ckp.da[2] = daz;
|
||||
ckp.db[2] = dbz;
|
||||
ckp.sfac = sfac;
|
||||
ckp.dsfacu[0] = dsfacux;
|
||||
ckp.dsfacu[1] = dsfacuy;
|
||||
ckp.dsfacu[2] = dsfacuz;
|
||||
|
||||
// Yes, this breaks the standard mantra of using SoA
|
||||
// instead of AoS, but it's net fine because of the
|
||||
// one warp per atom/neighbor pair for the recursive
|
||||
// polynomials. There's good L1 reuse, anyway.
|
||||
cayleyklein(iatom, jnbor) = ckp;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Initialize ulisttot with self-energy terms.
|
||||
Ulisttot uses a "half" data layout which takes
|
||||
advantage of the symmetry of the Wigner U matrices.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
|
||||
{
|
||||
for (int jelem = 0; jelem < nelements; jelem++) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
const int jju = idxu_block(j);
|
||||
const int jju_half = idxu_half_block(j);
|
||||
|
||||
// Only diagonal elements get initialized
|
||||
// for (int m = 0; m < (j+1)*(j+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
|
||||
// Top half only: gets symmetrized by TransformUi
|
||||
// for (int m = 0; m < (j+1)*(j/2+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
|
||||
[&] (const int m) {
|
||||
|
||||
const int jjup = jju + m;
|
||||
const int jjup = jju_half + m;
|
||||
|
||||
// if m is on the "diagonal", initialize it with the self energy.
|
||||
// Otherwise zero it out
|
||||
|
@ -323,40 +453,27 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
|||
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
|
||||
|
||||
// get shared memory offset
|
||||
const int max_m_tile = (twojmax+1)*(twojmax+1);
|
||||
const int max_m_tile = (twojmax+1)*(twojmax/2+1);
|
||||
const int team_rank = team.team_rank();
|
||||
const int scratch_shift = team_rank * max_m_tile;
|
||||
|
||||
// double buffer
|
||||
// shared memory double buffer
|
||||
// Each level stores a (j+1) x ((j+3)/2) tile in preparation
|
||||
// for the computation on the next level. This is the "cached"
|
||||
// data layout, which also gets used on the CPU.
|
||||
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
|
||||
const double x = rij(iatom,jnbor,0);
|
||||
const double y = rij(iatom,jnbor,1);
|
||||
const double z = rij(iatom,jnbor,2);
|
||||
// Each warp is accessing the same pack: take advantage of
|
||||
// L1 reuse
|
||||
const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
|
||||
|
||||
const SNAcomplex a = ckp.a;
|
||||
const SNAcomplex b = ckp.b;
|
||||
const double sfac = ckp.sfac;
|
||||
|
||||
const double wj_local = wj(iatom, jnbor);
|
||||
const double rcut = rcutij(iatom, jnbor);
|
||||
const int jelem = element(iatom, jnbor);
|
||||
|
||||
const double rsq = x * x + y * y + z * z;
|
||||
const double r = sqrt(rsq);
|
||||
|
||||
const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
||||
// theta0 = (r - rmin0) * rscale0;
|
||||
const double cs = cos(theta0);
|
||||
const double sn = sin(theta0);
|
||||
const double z0 = r * cs / sn; // r / tan(theta0)
|
||||
|
||||
// Compute cutoff function
|
||||
const double sfac = compute_sfac(r, rcut) * wj_local;
|
||||
|
||||
// compute Cayley-Klein parameters for unit quaternion,
|
||||
// pack into complex number
|
||||
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
const SNAcomplex a = { r0inv * z0, -r0inv * z };
|
||||
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
// VMK Section 4.8.2
|
||||
|
||||
// All writes go to global memory and shared memory
|
||||
|
@ -367,7 +484,8 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
|||
});
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
const int jju = idxu_block[j];
|
||||
|
||||
const int jju = idxu_half_block[j];
|
||||
|
||||
// fill in left side of matrix layer from previous layer
|
||||
|
||||
|
@ -377,8 +495,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
|||
// for (int mb = 0; 2*mb <= j; mb++)
|
||||
const int n_mb = j/2+1;
|
||||
|
||||
// the last (j / 2) can be avoided due to symmetry
|
||||
const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
|
||||
const int total_iters = n_ma * n_mb;
|
||||
|
||||
//for (int m = 0; m < total_iters; m++) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
|
||||
|
@ -423,25 +540,22 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
|||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
||||
// if j is even (-> physical j integer), last element maps to self, skip
|
||||
//if (!(m == total_iters - 1 && j % 2 == 0)) {
|
||||
if (m < total_iters - 1 || j % 2 == 1) {
|
||||
const int sign_factor = (((ma+mb)%2==0)?1:-1);
|
||||
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
|
||||
const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
|
||||
|
||||
const int sign_factor = (((ma+mb)%2==0)?1:-1);
|
||||
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
|
||||
|
||||
if (sign_factor == 1) {
|
||||
u_accum.im = -u_accum.im;
|
||||
} else {
|
||||
u_accum.re = -u_accum.re;
|
||||
}
|
||||
|
||||
buf2[jju_shared_flip] = u_accum;
|
||||
|
||||
// split re, im to get fully coalesced atomic add
|
||||
Kokkos::atomic_add(&(ulisttot_re(jjup_flip,jelem,iatom)), sfac * u_accum.re);
|
||||
Kokkos::atomic_add(&(ulisttot_im(jjup_flip,jelem,iatom)), sfac * u_accum.im);
|
||||
if (sign_factor == 1) {
|
||||
u_accum.im = -u_accum.im;
|
||||
} else {
|
||||
u_accum.re = -u_accum.re;
|
||||
}
|
||||
|
||||
if (j%2==1 && mb+1==n_mb) {
|
||||
buf2[jju_shared_flip] = u_accum;
|
||||
}
|
||||
|
||||
// symmetric part of ulisttot is generated in TransformUi
|
||||
|
||||
});
|
||||
// In CUDA backend,
|
||||
// ThreadVectorRange has a __syncwarp (appropriately masked for
|
||||
|
@ -454,8 +568,11 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Zi by summing over products of Ui, GPU version
|
||||
compute Zi by summing over products of Ui,
|
||||
AoSoA data layout to take advantage of coalescing, avoiding warp
|
||||
divergence. GPU version
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -525,6 +642,8 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Bi by summing conj(Ui)*Zi
|
||||
AoSoA data layout to take advantage of coalescing, avoiding warp
|
||||
divergence.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -615,8 +734,9 @@ void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, con
|
|||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from Ui without storing Zi, looping over zlist indices,
|
||||
GPU version
|
||||
compute Yi from Ui without storing Zi, looping over zlist indices.
|
||||
AoSoA data layout to take advantage of coalescing, avoiding warp
|
||||
divergence. GPU version.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -635,7 +755,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
|
|||
const int mb2max = idxz(jjz, 6);
|
||||
const int na = idxz(jjz, 7);
|
||||
const int nb = idxz(jjz, 8);
|
||||
const int jju = idxz(jjz, 9);
|
||||
const int jju_half = idxz(jjz, 9);
|
||||
|
||||
const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
|
||||
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
|
||||
|
@ -677,8 +797,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
|
|||
} // end loop over ib
|
||||
|
||||
if (bnorm_flag) {
|
||||
ztmp_r /= j + 1;
|
||||
ztmp_i /= j + 1;
|
||||
ztmp_r /= (j + 1);
|
||||
ztmp_i /= (j + 1);
|
||||
}
|
||||
|
||||
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
||||
|
@ -710,8 +830,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
|
|||
betaj *= (j1 + 1) / (j + 1.0);
|
||||
|
||||
|
||||
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i);
|
||||
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp_i);
|
||||
} // end loop over elem3
|
||||
} // end loop over elem2
|
||||
} // end loop over elem1
|
||||
|
@ -719,7 +839,7 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
Fused calculation of the derivative of Ui w.r.t. atom j
|
||||
and of dEidRj. GPU only.
|
||||
and accumulation into dEidRj. GPU only.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -732,6 +852,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
const int scratch_shift = team_rank * max_m_tile;
|
||||
const int jelem = element(iatom, jnbor);
|
||||
|
||||
// See notes on data layouts for shared memory caching
|
||||
// in `compute_ui`.
|
||||
|
||||
// double buffer for ulist
|
||||
SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
|
@ -740,50 +863,20 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
||||
|
||||
const double x = rij(iatom,jnbor,0);
|
||||
const double y = rij(iatom,jnbor,1);
|
||||
const double z = rij(iatom,jnbor,2);
|
||||
const double rsq = x * x + y * y + z * z;
|
||||
const double r = sqrt(rsq);
|
||||
const double rcut = rcutij(iatom, jnbor);
|
||||
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
|
||||
const double theta0 = (r - rmin0) * rscale0;
|
||||
const double cs = cos(theta0);
|
||||
const double sn = sin(theta0);
|
||||
const double z0 = r * cs / sn;
|
||||
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
||||
const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
|
||||
|
||||
const double wj_local = wj(iatom, jnbor);
|
||||
const double sfac = wj_local * compute_sfac(r, rcut);
|
||||
const double dsfac = wj_local * compute_dsfac(r, rcut);
|
||||
|
||||
const double rinv = 1.0 / r;
|
||||
|
||||
// extract a single unit vector
|
||||
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
|
||||
|
||||
// Compute Cayley-Klein parameters for unit quaternion
|
||||
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
|
||||
const SNAcomplex a = { r0inv * z0, -r0inv * z };
|
||||
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
const double dr0inv = dr0invdr * u;
|
||||
const double dz0 = dz0dr * u;
|
||||
|
||||
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
|
||||
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
|
||||
|
||||
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
|
||||
-x * dr0inv + (dir==0?-r0inv:0.) };
|
||||
const SNAcomplex a = ckp.a;
|
||||
const SNAcomplex b = ckp.b;
|
||||
const SNAcomplex da = ckp.da[dir];
|
||||
const SNAcomplex db = ckp.db[dir];
|
||||
const double sfac = ckp.sfac;
|
||||
const double dsfacu = ckp.dsfacu[dir]; // dsfac * u
|
||||
|
||||
// Accumulate the full contribution to dedr on the fly
|
||||
const double du_prod = dsfac * u; // chain rule
|
||||
const SNAcomplex y_local = ylist(0, jelem, iatom);
|
||||
|
||||
// Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
|
||||
double dedr_full_sum = 0.5 * du_prod * y_local.re;
|
||||
double dedr_full_sum = 0.5 * dsfacu * y_local.re;
|
||||
|
||||
// single has a warp barrier at the end
|
||||
Kokkos::single(Kokkos::PerThread(team), [=]() {
|
||||
|
@ -793,8 +886,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
});
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
int jjup = idxu_block[j-1];
|
||||
int jju_half = idxu_half_block[j];
|
||||
|
||||
// flatten the loop over ma,mb
|
||||
|
||||
|
@ -812,11 +904,10 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
[&] (const int m, double& sum_tmp) {
|
||||
|
||||
// ma fast, mb slow
|
||||
// Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
|
||||
const int mb = m / n_ma;
|
||||
const int ma = m - mb * n_ma;
|
||||
|
||||
const int jju_index = jju+m;
|
||||
const int jju_half_index = jju_half+m;
|
||||
|
||||
// Load y_local, apply the symmetry scaling factor
|
||||
// The "secret" of the shared memory optimization is it eliminates
|
||||
|
@ -824,7 +915,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
// shared memory and otherwise always writing, making the kernel
|
||||
// ultimately compute bound. We take advantage of that by adding
|
||||
// some reads back in.
|
||||
auto y_local = ylist(jju_index, jelem, iatom);
|
||||
auto y_local = ylist(jju_half_index, jelem, iatom);
|
||||
if (j % 2 == 0 && 2*mb == j) {
|
||||
if (ma == mb) { y_local = 0.5*y_local; }
|
||||
else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
|
||||
|
@ -870,7 +961,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
dulist_buf2[jju_shared_idx] = du_accum;
|
||||
|
||||
// Directly accumulate deidrj into sum_tmp
|
||||
const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
||||
const SNAcomplex du_prod = (dsfacu * u_accum) + (sfac * du_accum);
|
||||
sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
|
@ -923,8 +1014,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
|
|||
* ----------------------------------------------------------------------*/
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
* compute Ui by summing over neighbors j
|
||||
* ------------------------------------------------------------------------- */
|
||||
Ulisttot uses a "half" data layout which takes
|
||||
advantage of the symmetry of the Wigner U matrices.
|
||||
* ------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -932,11 +1024,11 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
|
|||
{
|
||||
for (int jelem = 0; jelem < nelements; jelem++) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
const int jju = idxu_block(j);
|
||||
const int jju = idxu_half_block(j);
|
||||
|
||||
// Only diagonal elements get initialized
|
||||
// for (int m = 0; m < (j+1)*(j+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
|
||||
// for (int m = 0; m < (j+1)*(j/2+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
|
||||
[&] (const int m) {
|
||||
|
||||
const int jjup = jju + m;
|
||||
|
@ -956,6 +1048,8 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Ui by summing over bispectrum components. CPU only.
|
||||
See comments above compute_uarray_cpu and add_uarraytot for
|
||||
data layout comments.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -1026,10 +1120,10 @@ void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
|
|||
int ma2 = ma2max;
|
||||
int icga = ma1min * (j2 + 1) + ma2max;
|
||||
for(int ia = 0; ia < na; ia++) {
|
||||
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
|
||||
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
|
||||
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
|
||||
suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
|
||||
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
|
||||
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
|
||||
ma1++;
|
||||
ma2--;
|
||||
icga += j2;
|
||||
|
@ -1098,8 +1192,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
|
|||
const int jjz_index = jjz + mb * (j + 1) + ma;
|
||||
if (2*mb == j) return;
|
||||
sum +=
|
||||
ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
|
||||
ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
|
||||
},sumzu_temp); // end loop over ma, mb
|
||||
sumzu += sumzu_temp;
|
||||
|
||||
|
@ -1113,8 +1207,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
|
|||
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
||||
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
||||
sum +=
|
||||
ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
|
||||
ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
|
||||
},sumzu_temp); // end loop over ma
|
||||
sumzu += sumzu_temp;
|
||||
|
||||
|
@ -1122,8 +1216,8 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
|
|||
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
||||
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
||||
sumzu += 0.5*
|
||||
(ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
|
||||
(ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
|
||||
ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
|
||||
} // end if jeven
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
|
@ -1154,17 +1248,6 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
|
|||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from Ui without storing Zi, looping over zlist indices
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem)
|
||||
{
|
||||
ylist(idx,ielem,iatom) = {0.0, 0.0};
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from Ui without storing Zi, looping over zlist indices,
|
||||
CPU version
|
||||
|
@ -1188,7 +1271,7 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
|
|||
const int mb2max = idxz(jjz, 6);
|
||||
const int na = idxz(jjz, 7);
|
||||
const int nb = idxz(jjz, 8);
|
||||
const int jju = idxz(jjz, 9);
|
||||
const int jju_half = idxz(jjz, 9);
|
||||
|
||||
const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
|
||||
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
|
||||
|
@ -1214,10 +1297,10 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
|
|||
int icga = ma1min*(j2+1) + ma2max;
|
||||
|
||||
for (int ia = 0; ia < na; ia++) {
|
||||
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
|
||||
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
|
||||
ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
|
||||
suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re -
|
||||
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im +
|
||||
ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re);
|
||||
ma1++;
|
||||
ma2--;
|
||||
icga += j2;
|
||||
|
@ -1264,8 +1347,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
|
|||
if (!bnorm_flag && j1 > j)
|
||||
betaj *= (j1 + 1) / (j + 1.0);
|
||||
|
||||
Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i);
|
||||
Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i);
|
||||
} // end loop over elem3
|
||||
} // end loop over elem2
|
||||
} // end loop over elem1
|
||||
|
@ -1274,6 +1357,8 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
calculate derivative of Ui w.r.t. atom j
|
||||
see comments above compute_duarray_cpu for comments on the
|
||||
data layout
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -1301,6 +1386,10 @@ void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute dEidRj, CPU path only.
|
||||
dulist takes advantage of a `cached` data layout, similar to the
|
||||
shared memory layout for the GPU routines, which is efficient for
|
||||
compressing the calculation in compute_duarray_cpu. That said,
|
||||
dulist only uses the "half" data layout part of that structure.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
|
@ -1314,14 +1403,18 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
|
|||
//for(int j = 0; j <= twojmax; j++) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
[&] (const int& j, t_scalar3<double>& sum_tmp) {
|
||||
int jju = idxu_block[j];
|
||||
int jju_half = idxu_half_block[j];
|
||||
int jju_cache = idxu_cache_block[j];
|
||||
|
||||
for(int mb = 0; 2*mb < j; mb++)
|
||||
for(int ma = 0; ma <= j; ma++) {
|
||||
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
|
||||
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
|
||||
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
|
||||
jju++;
|
||||
sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
|
||||
sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
|
||||
sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
|
||||
jju_half++; jju_cache++;
|
||||
} //end loop over ma mb
|
||||
|
||||
// For j even, handle middle column
|
||||
|
@ -1330,16 +1423,22 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
|
|||
|
||||
int mb = j/2;
|
||||
for(int ma = 0; ma < mb; ma++) {
|
||||
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
|
||||
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
|
||||
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
|
||||
jju++;
|
||||
sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im;
|
||||
sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im;
|
||||
sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im;
|
||||
jju_half++; jju_cache++;
|
||||
}
|
||||
|
||||
//int ma = mb;
|
||||
sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5;
|
||||
sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5;
|
||||
sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5;
|
||||
sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5;
|
||||
sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5;
|
||||
sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re +
|
||||
dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5;
|
||||
} // end if jeven
|
||||
|
||||
},final_sum); // end loop over j
|
||||
|
@ -1355,6 +1454,10 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
|
|||
|
||||
/* ----------------------------------------------------------------------
|
||||
add Wigner U-functions for one neighbor to the total
|
||||
ulist is in a "cached" data layout, which is a compressed layout
|
||||
which still keeps the recursive calculation simple. On the other hand
|
||||
`ulisttot` uses a "half" data layout, which fully takes advantage
|
||||
of the symmetry of the Wigner U matrices.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -1364,15 +1467,27 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
|
|||
{
|
||||
const double sfac = compute_sfac(r, rcut) * wj;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
|
||||
[&] (const int& i) {
|
||||
Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).re), sfac * ulist(i,iatom,jnbor).re);
|
||||
Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).im), sfac * ulist(i,iatom,jnbor).im);
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
[&] (const int& j) {
|
||||
|
||||
int jju_half = idxu_half_block[j]; // index into ulisttot
|
||||
int jju_cache = idxu_cache_block[j]; // index into ulist
|
||||
|
||||
int count = 0;
|
||||
for (int mb = 0; 2*mb <= j; mb++) {
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re);
|
||||
Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im);
|
||||
count++;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Wigner U-functions for one neighbor
|
||||
compute Wigner U-functions for one neighbor.
|
||||
`ulisttot` uses a "cached" data layout, matching the amount of
|
||||
information stored between layers via scratch memory on the GPU path
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -1399,8 +1514,8 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
|
|||
ulist(0,iatom,jnbor).im = 0.0;
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
int jjup = idxu_block[j-1];
|
||||
const int jju = idxu_cache_block[j];
|
||||
const int jjup = idxu_cache_block[j-1];
|
||||
|
||||
// fill in left side of matrix layer from previous layer
|
||||
|
||||
|
@ -1434,37 +1549,37 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
|
|||
(b_r * ulist(jjup_index,iatom,jnbor).im -
|
||||
b_i * ulist(jjup_index,iatom,jnbor).re);
|
||||
}
|
||||
});
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
||||
|
||||
jju = idxu_block[j];
|
||||
jjup = jju+(j+1)*(j+1)-1;
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
||||
[&] (const int& mb) {
|
||||
// for (int mb = 0; 2*mb <= j; mb++) {
|
||||
int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju+mb*(j+1)+ma;
|
||||
const int jjup_index = jjup-mb*(j+1)-ma;
|
||||
if (mapar == 1) {
|
||||
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
|
||||
} else {
|
||||
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
|
||||
// Only need to add one symmetrized row for convenience
|
||||
// Symmetry gets "unfolded" in accumulating ulisttot
|
||||
if (j%2==1 && mb==(j/2)) {
|
||||
const int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju + mb*(j+1) + ma;
|
||||
const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
|
||||
if (mapar == 1) {
|
||||
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
|
||||
} else {
|
||||
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute derivatives of Wigner U-functions for one neighbor
|
||||
see comments in compute_uarray_cpu()
|
||||
Uses same cached data layout of ulist
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -1524,8 +1639,8 @@ double r0inv;
|
|||
dulist(0,iatom,jnbor,2).im = 0.0;
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
int jjup = idxu_block[j-1];
|
||||
int jju = idxu_cache_block[j];
|
||||
int jjup = idxu_cache_block[j-1];
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
||||
[&] (const int& mb) {
|
||||
//for (int mb = 0; 2*mb <= j; mb++) {
|
||||
|
@ -1568,33 +1683,32 @@ double r0inv;
|
|||
b_i * dulist(jjup_index,iatom,jnbor,k).re);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
||||
// Only need to add one symmetrized row for convenience
|
||||
// Symmetry gets "unfolded" during the dedr accumulation
|
||||
|
||||
jju = idxu_block[j];
|
||||
jjup = jju+(j+1)*(j+1)-1;
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
||||
[&] (const int& mb) {
|
||||
// for (int mb = 0; 2*mb <= j; mb++) {
|
||||
int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju+mb*(j+1)+ma;
|
||||
const int jjup_index = jjup-mb*(j+1)-ma;
|
||||
if (mapar == 1) {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
||||
|
||||
if (j%2==1 && mb==(j/2)) {
|
||||
const int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju+mb*(j+1)+ma;
|
||||
const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
|
||||
if (mapar == 1) {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
|
||||
}
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -1605,8 +1719,11 @@ double r0inv;
|
|||
sfac *= wj;
|
||||
dsfac *= wj;
|
||||
|
||||
// Even though we fill out a full "cached" data layout above,
|
||||
// we only need the "half" data for the accumulation into dedr.
|
||||
// Thus we skip updating any unnecessary data.
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
int jju = idxu_cache_block[j];
|
||||
for (int mb = 0; 2*mb <= j; mb++)
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
|
||||
|
@ -1980,6 +2097,24 @@ double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
|
|||
return 0.0;
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_s_dsfac(const double r, const double rcut, double& sfac, double& dsfac) {
|
||||
if (switch_flag == 0) { sfac = 0.; dsfac = 0.; }
|
||||
else if (switch_flag == 1) {
|
||||
if (r <= rmin0) { sfac = 1.0; dsfac = 0.0; }
|
||||
else if (r > rcut) { sfac = 0.; dsfac = 0.; }
|
||||
else {
|
||||
const double rcutfac = MY_PI / (rcut - rmin0);
|
||||
double sn, cs;
|
||||
sincos((r - rmin0) * rcutfac, &sn, &cs);
|
||||
sfac = 0.5 * (cs + 1.0);
|
||||
dsfac = -0.5 * sn * rcutfac;
|
||||
|
||||
}
|
||||
} else { sfac = 0.; dsfac = 0.; }
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// efficient complex FMA (i.e., y += a x)
|
||||
|
@ -2031,30 +2166,31 @@ double SNAKokkos<DeviceType>::memory_usage()
|
|||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
|
||||
int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts
|
||||
|
||||
auto natom_pad = (natom+32-1)/32;
|
||||
|
||||
bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_re
|
||||
bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_im
|
||||
bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_re
|
||||
bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_im
|
||||
bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack
|
||||
|
||||
bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack
|
||||
bytes += natom_pad * idxb_max * ntriples * sizeof(double); // blist_pack
|
||||
|
||||
bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_re
|
||||
bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_im
|
||||
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist
|
||||
bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_re
|
||||
bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_im
|
||||
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
|
||||
} else {
|
||||
#endif
|
||||
|
||||
bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist
|
||||
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot
|
||||
bytes += natom * nmax * idxu_cache_max * sizeof(double) * 2; // ulist
|
||||
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ulisttot
|
||||
|
||||
bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist
|
||||
bytes += natom * idxb_max * ntriples * sizeof(double); // blist
|
||||
|
||||
bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist
|
||||
bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
|
||||
|
||||
bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2; // dulist
|
||||
bytes += natom * nmax * idxu_cache_max * 3 * sizeof(double) * 2; // dulist
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
}
|
||||
#endif
|
||||
|
@ -2063,6 +2199,8 @@ double SNAKokkos<DeviceType>::memory_usage()
|
|||
|
||||
bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block
|
||||
bytes += jdim * sizeof(int); // idxu_block
|
||||
bytes += jdim * sizeof(int); // idxu_half_block
|
||||
bytes += jdim * sizeof(int); // idxu_cache_block
|
||||
bytes += jdim * jdim * jdim * sizeof(int); // idxz_block
|
||||
bytes += jdim * jdim * jdim * sizeof(int); // idxb_block
|
||||
|
||||
|
|
Loading…
Reference in New Issue