git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15166 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2016-06-14 16:33:40 +00:00
parent ecffbbe531
commit 2dbcfdc70c
8 changed files with 893 additions and 8 deletions

View File

@ -6,6 +6,7 @@
W. Michael Brown (Intel) michael.w.brown at intel.com
Rodrigo Canales (RWTH Aachen University)
Markus H<>hnerbach (RWTH Aachen University)
Stan Moore (Sandia)
Ahmed E. Ismail (RWTH Aachen University)
Paolo Bientinesi (RWTH Aachen University)
Anupama Kurpad (Intel)

View File

@ -0,0 +1,295 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Stan Moore (Sandia)
------------------------------------------------------------------------- */
#include <math.h>
#include <stdlib.h>
#include "bond_fene_intel.h"
#include "atom.h"
#include "neighbor.h"
#include "domain.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "suffix.h"
#include "error.h"
using namespace LAMMPS_NS;
typedef struct { int a,b,t; } int3_t;
/* ---------------------------------------------------------------------- */
BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp)
{
suffix_flag |= Suffix::INTEL;
}
/* ---------------------------------------------------------------------- */
BondFENEIntel::~BondFENEIntel()
{
}
/* ---------------------------------------------------------------------- */
void BondFENEIntel::compute(int eflag, int vflag)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_use_base) {
BondFENE::compute(eflag, vflag);
return;
}
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void BondFENEIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) ev_setup(eflag,vflag);
else evflag = 0;
if (evflag) {
if (eflag) {
if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc);
else
eval<1,1,0>(vflag, buffers, fc);
} else {
if (force->newton_bond)
eval<1,0,1>(vflag, buffers, fc);
else
eval<1,0,0>(vflag, buffers, fc);
}
} else {
if (force->newton_bond)
eval<0,0,1>(vflag, buffers, fc);
else
eval<0,0,0>(vflag, buffers, fc);
}
}
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void BondFENEIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
const int inum = neighbor->nbondlist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
if (EFLAG)
oebond = (acc_t)0.0;
if (vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int nfrom, nto, tid;
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int3_t * _noalias const bondlist =
(int3_t *) neighbor->bondlist[0];
for (int n = nfrom; n < nto; n++) {
const int i1 = bondlist[n].a;
const int i2 = bondlist[n].b;
const int type = bondlist[n].t;
const flt_t ir0sq = fc.fc[type].ir0sq;
const flt_t k = fc.fc[type].k;
const flt_t sigma = fc.fc[type].sigma;
const flt_t sigmasq = sigma*sigma;
const flt_t epsilon = fc.fc[type].epsilon;
const flt_t delx = x[i1].x - x[i2].x;
const flt_t dely = x[i1].y - x[i2].y;
const flt_t delz = x[i1].z - x[i2].z;
const flt_t rsq = delx*delx + dely*dely + delz*delz;
flt_t rlogarg = (flt_t)1.0 - rsq * ir0sq;
flt_t irsq = (flt_t)1.0 / rsq;
// if r -> r0, then rlogarg < 0.0 which is an error
// issue a warning and reset rlogarg = epsilon
// if r > 2*r0 something serious is wrong, abort
if (rlogarg < (flt_t)0.1) {
char str[128];
sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " %g",
update->ntimestep,atom->tag[i1],atom->tag[i2],sqrt(rsq));
error->warning(FLERR,str,0);
if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
rlogarg = (flt_t)0.1;
}
flt_t fbond = -k/rlogarg;
// force from LJ term
flt_t sr2,sr6;
if (rsq < (flt_t)TWO_1_3*sigmasq) {
sr2 = sigmasq * irsq;
sr6 = sr2 * sr2 * sr2;
fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
}
// energy
flt_t ebond;
if (EFLAG) {
ebond = (flt_t)-0.5 * k / ir0sq * log(rlogarg);
if (rsq < (flt_t)TWO_1_3 * sigmasq)
ebond += (flt_t)4.0 * epsilon * sr6 * (sr6 - (flt_t)1.0) + epsilon;
}
// apply force to each of 2 atoms
if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += delx*fbond;
f[i1].y += dely*fbond;
f[i1].z += delz*fbond;
}
if (NEWTON_BOND || i2 < nlocal) {
f[i2].x -= delx*fbond;
f[i2].y -= dely*fbond;
f[i2].z -= delz*fbond;
}
if (EVFLAG) {
IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond,
delx, dely, delz, oebond, f, NEWTON_BOND,
nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
}
} // for n
} // omp parallel
if (EVFLAG) {
if (EFLAG)
energy += oebond;
if (vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
}
}
fix->set_reduce_flag();
}
/* ---------------------------------------------------------------------- */
void BondFENEIntel::init_style()
{
BondFENE::init_style();
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
_use_base = 0;
if (fix->offload_balance() != 0.0) {
_use_base = 1;
return;
}
#endif
fix->bond_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int bp1 = atom->nbondtypes + 1;
fc.set_ntypes(bp1,memory);
for (int i = 0; i < bp1; i++) {
fc.fc[i].k = k[i];
fc.fc[i].ir0sq = 1.0 / (r0[i] * r0[i]);
fc.fc[i].sigma = sigma[i];
fc.fc[i].epsilon = epsilon[i];
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
Memory *memory) {
if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0)
_memory->destroy(fc);
if (nbondtypes > 0)
_memory->create(fc,nbondtypes,"bondfeneintel.fc");
}
_nbondtypes = nbondtypes;
_memory = memory;
}

View File

@ -0,0 +1,88 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Stan Moore (Sandia)
------------------------------------------------------------------------- */
#ifdef BOND_CLASS
BondStyle(fene/intel,BondFENEIntel)
#else
#ifndef LMP_BOND_FENE_INTEL_H
#define LMP_BOND_FENE_INTEL_H
#include <stdio.h>
#include "bond_fene.h"
#include "fix_intel.h"
namespace LAMMPS_NS {
class BondFENEIntel : public BondFENE {
public:
BondFENEIntel(class LAMMPS *);
virtual ~BondFENEIntel();
virtual void compute(int, int);
virtual void init_style();
protected:
FixIntel *fix;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t k, ir0sq, sigma, epsilon; } fc_packed1;
fc_packed1 *fc;
ForceConst() : _nbondtypes(0) {}
~ForceConst() { set_ntypes(0, NULL); }
void set_ntypes(const int nbondtypes, Memory *memory);
private:
int _nbondtypes;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: Incorrect args for bond coefficients
Self-explanatory. Check the input script or data file.
*/

View File

@ -28,12 +28,22 @@
#include "update.h"
#include "error.h"
#ifdef LMP_USE_AVXCD
#if (__INTEL_COMPILER_BUILD_DATE > 20160414)
#define LMP_USE_AVXCD_DHC
#endif
#endif
#ifdef LMP_USE_AVXCD_DHC
#include "intel_simd.h"
using namespace ip_simd;
#endif
#include "suffix.h"
using namespace LAMMPS_NS;
#define PTOLERANCE (flt_t)1.05
#define MTOLERANCE (flt_t)-1.05
#define SMALL (flt_t)0.001
typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */
@ -102,6 +112,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
}
}
#ifndef LMP_USE_AVXCD_DHC
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralCharmmIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
@ -388,6 +400,8 @@ void DihedralCharmmIntel::eval(const int vflag,
f[i4].w += evdwl;
}
}
// IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
// delx, dely, delz);
if (vflag) {
spv0 += ev_pre * delx * delx * fpair;
spv1 += ev_pre * dely * dely * fpair;
@ -448,6 +462,442 @@ void DihedralCharmmIntel::eval(const int vflag,
fix->set_reduce_flag();
}
#else
/* ----------------------------------------------------------------------
Vector intrinsics are temporarily being used for the Stillinger-Weber
potential to allow for advanced features in the AVX512 instruction set to
be exploited on early hardware. We hope to see compiler improvements for
AVX512 that will eliminate this requirement, so it is not recommended to
develop code based on the intrinsics implementation. Please e-mail the
authors for more details.
------------------------------------------------------------------------- */
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralCharmmIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
const int swidth = SIMD_type<flt_t>::width();
const int inum = neighbor->ndihedrallist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
flt_t * _noalias const q = buffers->get_q(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
if (EVFLAG) {
if (EFLAG)
oevdwl = oecoul = oedihedral = (acc_t)0.0;
if (vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
}
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
opv0,opv1,opv2,opv3,opv4,opv5)
#endif
{
int nfrom, nto, tid;
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int * _noalias const dihedrallist =
(int *) neighbor->dihedrallist[0];
const flt_t * _noalias const weight = &(fc.weight[0]);
const flt_t * _noalias const x_f = &(x[0].x);
const flt_t * _noalias const cos_shift = &(fc.bp[0].cos_shift);
const flt_t * _noalias const sin_shift = &(fc.bp[0].sin_shift);
const flt_t * _noalias const k = &(fc.bp[0].k);
const int * _noalias const multiplicity = &(fc.bp[0].multiplicity);
const flt_t * _noalias const plj1 = &(fc.ljp[0][0].lj1);
const flt_t * _noalias const plj2 = &(fc.ljp[0][0].lj2);
const flt_t * _noalias const plj3 = &(fc.ljp[0][0].lj3);
const flt_t * _noalias const plj4 = &(fc.ljp[0][0].lj4);
acc_t * _noalias const pforce= &(f[0].x);
acc_t * _noalias const featom = &(f[0].w);
const flt_t qqrd2e = force->qqrd2e;
SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
if (EVFLAG) {
if (EFLAG) {
sevdwl = SIMD_set((acc_t)0.0);
secoul = SIMD_set((acc_t)0.0);
sedihedral = SIMD_set((acc_t)0.0);
}
if (vflag) {
sv0 = SIMD_set((acc_t)0.0);
sv1 = SIMD_set((acc_t)0.0);
sv2 = SIMD_set((acc_t)0.0);
sv3 = SIMD_set((acc_t)0.0);
sv4 = SIMD_set((acc_t)0.0);
sv5 = SIMD_set((acc_t)0.0);
spv0 = SIMD_set((acc_t)0.0);
spv1 = SIMD_set((acc_t)0.0);
spv2 = SIMD_set((acc_t)0.0);
spv3 = SIMD_set((acc_t)0.0);
spv4 = SIMD_set((acc_t)0.0);
spv5 = SIMD_set((acc_t)0.0);
}
}
SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
55, 60, 65, 70, 75) + (nfrom * 5);
const int nto5 = nto * 5;
const int nlocals4 = nlocal << 4;
const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
const int ntypes = atom->ntypes + 1;
for (int n = nfrom; n < nto; n += swidth) {
SIMD_mask nmask = n_offset < nto5;
SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
i1 = i1 << 4;
const SIMD_int i2 = SIMD_gather(nmask, dihedrallist+1, n_offset) << 4;
const SIMD_int i3 = SIMD_gather(nmask, dihedrallist+2, n_offset) << 4;
SIMD_int i4 = SIMD_gather(nmask, dihedrallist+3, n_offset);
const SIMD_flt_t q4 = SIMD_gather(nmask, q, i4);
i4 = i4 << 4;
SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
type = type << 2;
n_offset = n_offset + swidth * 5;
// 1st bond
SIMD_flt_t x1, x2, y1, y2, z1, z2;
SIMD_int itype;
SIMD_atom_gather(nmask, x_f, i1, x1, y1, z1, itype);
SIMD_atom_gather(nmask, x_f, i2, x2, y2, z2);
const SIMD_flt_t vb1x = x1 - x2;
const SIMD_flt_t vb1y = y1 - y2;
const SIMD_flt_t vb1z = z1 - z2;
// 2nd bond
SIMD_flt_t x3, y3, z3;
SIMD_atom_gather(nmask, x_f, i3, x3, y3, z3);
const SIMD_flt_t vb2xm = x2 - x3;
const SIMD_flt_t vb2ym = y2 - y3;
const SIMD_flt_t vb2zm = z2 - z3;
// 3rd bond
SIMD_flt_t x4, y4, z4;
SIMD_int jtype;
SIMD_atom_gather(nmask, x_f, i4, x4, y4, z4, jtype);
const SIMD_flt_t vb3x = x4 - x3;
const SIMD_flt_t vb3y = y4 - y3;
const SIMD_flt_t vb3z = z4 - z3;
// 1-4
const SIMD_flt_t delx = x1 - x4;
const SIMD_flt_t dely = y1 - y4;
const SIMD_flt_t delz = z1 - z4;
// c,s calculation
const SIMD_flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
const SIMD_flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
const SIMD_flt_t az = vb1x*vb2ym - vb1y*vb2xm;
const SIMD_flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
const SIMD_flt_t by = vb3z*vb2xm - vb3x*vb2zm;
const SIMD_flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
const SIMD_flt_t rasq = ax*ax + ay*ay + az*az;
const SIMD_flt_t rbsq = bx*bx + by*by + bz*bz;
const SIMD_flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
const SIMD_flt_t rg = SIMD_sqrt(rgsq);
const SIMD_flt_t szero = SIMD_set((flt_t)0.0);
const SIMD_flt_t rginv = SIMD_rcpz(rg > szero, rg);
const SIMD_flt_t ra2inv = SIMD_rcpz(rasq > szero, rasq);
const SIMD_flt_t rb2inv = SIMD_rcpz(rbsq > szero, rbsq);
const SIMD_flt_t rabinv = SIMD_sqrt(ra2inv*rb2inv);
SIMD_flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
const SIMD_flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
// error check
const SIMD_flt_t one = SIMD_set((flt_t)1.0);
const SIMD_flt_t mone = SIMD_set((flt_t)-1.0);
const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
if (c > ptol || c < ntol)
if (screen)
error->warning(FLERR,"Dihedral problem.");
c = SIMD_set(c, c > one, one);
c = SIMD_set(c, c < mone, mone);
const SIMD_flt_t tcos_shift = SIMD_gather(nmask, cos_shift, type);
const SIMD_flt_t tsin_shift = SIMD_gather(nmask, sin_shift, type);
const SIMD_flt_t tk = SIMD_gather(nmask, k, type);
const SIMD_int m = SIMD_gather(nmask, multiplicity, type);
SIMD_flt_t p(one);
SIMD_flt_t ddf1(szero);
SIMD_flt_t df1(szero);
const int m_max = SIMD_max(m);
for (int i = 0; i < m_max; i++) {
const SIMD_mask my_m = i < m;
ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
df1 = SIMD_set(df1, my_m, p*s + df1*c);
p = SIMD_set(p, my_m, ddf1);
}
SIMD_flt_t multf;
SIMD_cast(-m,multf);
p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 = df1 * multf;
p = p + one;
SIMD_mask mzero = (m == SIMD_set((int)0));
p = SIMD_set(p, mzero, one + tcos_shift);
df1 = SIMD_set(df1, mzero, szero);
const SIMD_flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
const SIMD_flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
const SIMD_flt_t fga = fg*ra2inv*rginv;
const SIMD_flt_t hgb = hg*rb2inv*rginv;
const SIMD_flt_t gaa = -ra2inv*rg;
const SIMD_flt_t gbb = rb2inv*rg;
const SIMD_flt_t dtfx = gaa*ax;
const SIMD_flt_t dtfy = gaa*ay;
const SIMD_flt_t dtfz = gaa*az;
const SIMD_flt_t dtgx = fga*ax - hgb*bx;
const SIMD_flt_t dtgy = fga*ay - hgb*by;
const SIMD_flt_t dtgz = fga*az - hgb*bz;
const SIMD_flt_t dthx = gbb*bx;
const SIMD_flt_t dthy = gbb*by;
const SIMD_flt_t dthz = gbb*bz;
const SIMD_flt_t df = -tk * df1;
const SIMD_flt_t sx2 = df*dtgx;
const SIMD_flt_t sy2 = df*dtgy;
const SIMD_flt_t sz2 = df*dtgz;
SIMD_flt_t f1x = df*dtfx;
SIMD_flt_t f1y = df*dtfy;
SIMD_flt_t f1z = df*dtfz;
SIMD_flt_t f2x = sx2 - f1x;
SIMD_flt_t f2y = sy2 - f1y;
SIMD_flt_t f2z = sz2 - f1z;
SIMD_flt_t f4x = df*dthx;
SIMD_flt_t f4y = df*dthy;
SIMD_flt_t f4z = df*dthz;
SIMD_flt_t f3x = -sx2 - f4x;
SIMD_flt_t f3y = -sy2 - f4y;
SIMD_flt_t f3z = -sz2 - f4z;
SIMD_flt_t qdeng;
if (EVFLAG) {
SIMD_flt_t ev_pre;
if (NEWTON_BOND) ev_pre = one;
else {
ev_pre = szero;
const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
}
SIMD_zero_masked(nmask, ev_pre);
if (EFLAG) {
const SIMD_flt_t deng = tk * p;
sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
if (eatom) {
qdeng = deng * SIMD_set((flt_t)0.25);
SIMD_mask newton_mask;
if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
SIMD_flt_t ieng = qdeng;
SIMD_jeng_update(newton_mask, featom, i2, ieng);
ieng = qdeng;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
SIMD_jeng_update(newton_mask, featom, i3, ieng);
}
}
if (vflag) {
sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
}
}
SIMD_mask newton_mask;
if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
SIMD_safe_jforce(newton_mask, pforce, i2, f2x, f2y, f2z);
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
SIMD_safe_jforce(newton_mask, pforce, i3, f3x, f3y, f3z);
// 1-4 LJ and Coulomb interactions
// tally energy/virial in pair, using newton_bond as newton flag
const SIMD_flt_t rsq = delx*delx + dely*dely + delz*delz;
const SIMD_flt_t r2inv = SIMD_rcpz(nmask, rsq);
const SIMD_flt_t r6inv = r2inv*r2inv*r2inv;
const SIMD_flt_t simd_qqrd2e = SIMD_set(qqrd2e);
SIMD_flt_t forcecoul;
if (implicit) forcecoul = simd_qqrd2e * q1 * q4 * r2inv;
else forcecoul = simd_qqrd2e * q1 * q4 * SIMD_sqrt(r2inv);
const SIMD_int ijtype = (itype * ntypes + jtype) << 2;
const SIMD_flt_t lj1 = SIMD_gather(nmask, plj1, ijtype);
const SIMD_flt_t lj2 = SIMD_gather(nmask, plj2, ijtype);
const SIMD_flt_t forcelj = r6inv * (lj1 * r6inv - lj2);
const SIMD_flt_t fpair = tweight * (forcelj + forcecoul) * r2inv;
f1x = f1x + delx * fpair;
f1y = f1y + dely * fpair;
f1z = f1z + delz * fpair;
f4x = f4x - delx * fpair;
f4y = f4y - dely * fpair;
f4z = f4z - delz * fpair;
if (EVFLAG) {
SIMD_flt_t ev_pre;
if (NEWTON_BOND) ev_pre = one;
else {
ev_pre = szero;
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
}
SIMD_zero_masked(nmask, ev_pre);
if (EFLAG) {
const SIMD_flt_t ecoul = tweight * forcecoul;
const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
if (eatom) {
const SIMD_flt_t half = SIMD_set((flt_t)0.5);
evdwl = evdwl * half;
evdwl = evdwl + half * ecoul + qdeng;
if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
SIMD_flt_t ieng = evdwl;
SIMD_jeng_update(newton_mask, featom, i1, ieng);
ieng = evdwl;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
SIMD_jeng_update(newton_mask, featom, i4, ieng);
}
}
if (vflag) {
spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
}
}
if (NEWTON_BOND) newton_mask = nmask;
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
SIMD_safe_jforce(newton_mask, pforce, i1, f1x, f1y, f1z);
if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
} // for n
if (EVFLAG) {
if (EFLAG) {
oedihedral += SIMD_sum(sedihedral);
oecoul += SIMD_sum(secoul);
oevdwl += SIMD_sum(sevdwl);
}
if (vflag) {
ov0 += SIMD_sum(sv0);
ov1 += SIMD_sum(sv1);
ov2 += SIMD_sum(sv2);
ov3 += SIMD_sum(sv3);
ov4 += SIMD_sum(sv4);
ov5 += SIMD_sum(sv5);
opv0 += SIMD_sum(spv0);
opv1 += SIMD_sum(spv1);
opv2 += SIMD_sum(spv2);
opv3 += SIMD_sum(spv3);
opv4 += SIMD_sum(spv4);
opv5 += SIMD_sum(spv5);
}
}
} // omp parallel
if (EVFLAG) {
if (EFLAG) {
energy += oedihedral;
force->pair->eng_vdwl += oevdwl;
force->pair->eng_coul += oecoul;
}
if (vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
force->pair->virial[0] += opv0;
force->pair->virial[1] += opv1;
force->pair->virial[2] += opv2;
force->pair->virial[3] += opv3;
force->pair->virial[4] += opv4;
force->pair->virial[5] += opv5;
}
}
fix->set_reduce_flag();
}
#endif
/* ---------------------------------------------------------------------- */
void DihedralCharmmIntel::init_style()

View File

@ -259,7 +259,12 @@ struct vector_ops<double, KNC> {
return _mm512_invsqrt_pd(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512d *>(cos) = _mm512_cos_pd(a);
return _mm512_sin_pd(a);
#else
return _mm512_sincos_pd(reinterpret_cast<__m512d *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_pd(a);
@ -392,7 +397,12 @@ struct vector_ops<float, KNC> {
return _mm512_invsqrt_ps(a);
}
static fvec sincos(fvec *cos, const fvec &a) {
#if __INTEL_COMPILER+0 < 1500
*reinterpret_cast<__m512 *>(cos) = _mm512_cos_ps(a);
return _mm512_sin_ps(a);
#else
return _mm512_sincos_ps(reinterpret_cast<__m512 *>(cos), a);
#endif
}
static fscal reduce_add(const fvec &a) {
return _mm512_reduce_add_ps(a);

View File

@ -37,10 +37,28 @@ authors for more details.
namespace ip_simd {
typedef __m512i SIMD_int;
typedef __mmask16 SIMD_mask;
typedef __m512 SIMD_float;
typedef __m512d SIMD_double;
struct SIMD_int {
__m512i v;
SIMD_int() {}
SIMD_int(const __m512i in) : v(in) {}
operator __m512i() const { return v;}
};
struct SIMD_float {
__m512 v;
SIMD_float() {}
SIMD_float(const __m512 in) : v(in) {}
operator __m512() const { return v;}
};
struct SIMD_double {
__m512d v;
SIMD_double() {}
SIMD_double(const __m512d in) : v(in) {}
operator __m512d() const { return v;}
};
template<class flt_t>
class SIMD_type {
@ -276,6 +294,18 @@ namespace ip_simd {
return _mm512_mask_sub_pd(s,m,one,two);
}
inline SIMD_int operator-(const SIMD_int &one) {
return _mm512_sub_epi32(SIMD_set((int)0),one);
}
inline SIMD_float operator-(const SIMD_float &one) {
return _mm512_sub_ps(SIMD_set((float)0),one);
}
inline SIMD_double operator-(const SIMD_double &one) {
return _mm512_sub_pd(SIMD_set((double)0),one);
}
inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) {
return _mm512_sub_epi32(one,two);
}
@ -694,17 +724,17 @@ namespace ip_simd {
// -------- I/O operations
inline void SIMD_print(const SIMD_int &vec) {
inline void SIMD_print(const __m512i &vec) {
for (int i = 0; i < 16; i++)
printf("%d ",(*((int*)&(vec) + (i))));
}
inline void SIMD_print(const SIMD_float &vec) {
inline void SIMD_print(const __m512 &vec) {
for (int i = 0; i < 16; i++)
printf("%f ",(*((float*)&(vec) + (i))));
}
inline void SIMD_print(const SIMD_double &vec) {
inline void SIMD_print(const __m512d &vec) {
for (int i = 0; i < 8; i++)
printf("%f ",(*((double*)&(vec) + (i))));
}

View File

@ -1037,8 +1037,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd
@ -1645,8 +1647,10 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd
@ -2241,8 +2245,10 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
#if defined(LMP_SIMD_COMPILER)
#ifdef _LMP_INTEL_OFFLOAD
int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
#if __INTEL_COMPILER+0 > 1499
#pragma vector aligned
#pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
#endif
#else
#pragma vector aligned
#pragma simd

View File

@ -323,7 +323,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
lj4 = lj34i[jtype].lj4;
offset = ljc12oi[jtype].offset;
}
evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
evdwl = r6inv * (lj3 * r6inv - lj4);
#ifdef INTEL_VMASK
evdwl -= offset;
#else
if (rsq < cutsq) evdwl -= offset;
#endif
if (!ONETYPE) evdwl *= factor_lj;
sevdwl += ev_pre*evdwl;
if (eatom) {