forked from lijiext/lammps
Adding pair style dpd/intel and dihedral style fourier/intel
Adding raw performance numbers for Skylake xeon server. Fixes for using older Intel compilers and compiling without OpenMP. Fix adding in hooks for using USER-INTEL w/ minimization.
This commit is contained in:
parent
f2c1172741
commit
cf24dd0265
Binary file not shown.
Before Width: | Height: | Size: 20 KiB After Width: | Height: | Size: 19 KiB |
|
@ -25,12 +25,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
|
|||
[Currently Available USER-INTEL Styles:]
|
||||
|
||||
Angle Styles: charmm, harmonic :ulb,l
|
||||
Bond Styles: fene, harmonic :l
|
||||
Bond Styles: fene, fourier, harmonic :l
|
||||
Dihedral Styles: charmm, harmonic, opls :l
|
||||
Fixes: nve, npt, nvt, nvt/sllod :l
|
||||
Improper Styles: cvff, harmonic :l
|
||||
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
|
||||
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
|
||||
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
|
||||
sw, tersoff :l
|
||||
K-Space Styles: pppm, pppm/disp :l
|
||||
|
@ -82,6 +82,10 @@ this order :l
|
|||
The {newton} setting applies to all atoms, not just atoms shared
|
||||
between MPI tasks :l
|
||||
Vectorization can change the order for adding pairwise forces :l
|
||||
Unless specified otherwise at build time, the random number
|
||||
generator for dissipative particle dynamics uses the Mersenne
|
||||
Twister generator (that should be more robust than the standard
|
||||
generator) :l
|
||||
:ule
|
||||
|
||||
The precision mode (described below) used with the USER-INTEL
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
:line
|
||||
|
||||
dihedral_style fourier command :h3
|
||||
dihedral_style fourier/intel command :h3
|
||||
dihedral_style fourier/omp command :h3
|
||||
|
||||
[Syntax:]
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
pair_style dpd command :h3
|
||||
pair_style dpd/gpu command :h3
|
||||
pair_style dpd/intel command :h3
|
||||
pair_style dpd/omp command :h3
|
||||
pair_style dpd/tstat command :h3
|
||||
pair_style dpd/tstat/gpu command :h3
|
||||
|
|
|
@ -30,14 +30,15 @@ be added or changed in the Makefile depending on the version:
|
|||
|
||||
2017 update 2 - No changes needed
|
||||
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
|
||||
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
|
||||
2018 inital release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
|
||||
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
When using the suffix command with "intel", intel styles will be used if they
|
||||
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
|
||||
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
|
||||
allow for running most styles in LAMMPS with threading.
|
||||
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
|
||||
available. This allow for running most styles in LAMMPS with threading.
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
|
@ -52,6 +53,15 @@ need to be changed.
|
|||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
The random number generator for Dissipative Particle Dynamics (DPD) in the
|
||||
Intel package uses the Mersenne Twister pseudorandom number generator as
|
||||
implemented in the Intel Math Kernel Library (MKL). This generator is faster
|
||||
and more robust with a significantly longer period than the default DPD
|
||||
generator. However, if MKL is not installed, the standard random number
|
||||
generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG".
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
|
||||
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
|
||||
Intel compilers.
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
# in.intel.tersoff - Silicon benchmark with Tersoff
|
||||
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
|
||||
# in.intel.airebo - Polyethelene benchmark with AIREBO
|
||||
# in.intel.dpd - Dissipative Particle Dynamics
|
||||
#
|
||||
#############################################################################
|
||||
|
||||
|
@ -16,16 +17,17 @@
|
|||
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
|
||||
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
|
||||
#
|
||||
# Xeon E5-2697v4 Xeon Phi 7250
|
||||
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
|
||||
#
|
||||
# in.intel.lj - 199.5 282.3
|
||||
# in.intel.rhodo - 12.4 17.5
|
||||
# in.intel.lc - 19.0 25.7
|
||||
# in.intel.eam - 59.4 92.8
|
||||
# in.intel.sw - 132.4 161.9
|
||||
# in.intel.tersoff - 83.3 101.1
|
||||
# in.intel.water - 53.4 90.3
|
||||
# in.intel.airebo - 7.3 11.8
|
||||
# in.intel.lj - 199.5 282.3 317.3
|
||||
# in.intel.rhodo - 12.4 17.5 24.4
|
||||
# in.intel.lc - 19.0 25.7 26.8
|
||||
# in.intel.eam - 59.4 92.8 105.6
|
||||
# in.intel.sw - 132.4 161.9 213.8
|
||||
# in.intel.tersoff - 83.3 101.1 109.6
|
||||
# in.intel.water - 53.4 90.3 105.5
|
||||
# in.intel.airebo - 7.3 11.8 17.6
|
||||
# in.intel.dpd - 74.5 100.4 148.1
|
||||
#
|
||||
#############################################################################
|
||||
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
# DPD benchmark
|
||||
|
||||
variable N index on # Newton Setting
|
||||
variable w index 10 # Warmup Timesteps
|
||||
variable t index 4000 # Main Run Timesteps
|
||||
variable m index 1 # Main Run Timestep Multiplier
|
||||
variable n index 0 # Use NUMA Mapping for Multi-Node
|
||||
variable p index 0 # Use Power Measurement
|
||||
|
||||
variable x index 4
|
||||
variable y index 2
|
||||
variable z index 2
|
||||
|
||||
variable xx equal 20*$x
|
||||
variable yy equal 20*$y
|
||||
variable zz equal 20*$z
|
||||
variable rr equal floor($t*$m)
|
||||
|
||||
newton $N
|
||||
if "$n > 0" then "processors * * * grid numa"
|
||||
|
||||
units lj
|
||||
atom_style atomic
|
||||
comm_modify mode single vel yes
|
||||
|
||||
lattice fcc 3.0
|
||||
region box block 0 ${xx} 0 ${yy} 0 ${zz}
|
||||
create_box 1 box
|
||||
create_atoms 1 box
|
||||
mass 1 1.0
|
||||
|
||||
velocity all create 1.0 87287 loop geom
|
||||
|
||||
pair_style dpd 1.0 1.0 928948
|
||||
pair_coeff 1 1 25.0 4.5
|
||||
|
||||
neighbor 0.5 bin
|
||||
neigh_modify delay 0 every 1
|
||||
|
||||
fix 1 all nve
|
||||
timestep 0.04
|
||||
|
||||
thermo 1000
|
||||
|
||||
if "$p > 0" then "run_style verlet/power"
|
||||
|
||||
if "$w > 0" then "run $w"
|
||||
run ${rr}
|
|
@ -0,0 +1,441 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <mpi.h>
|
||||
#include <math.h>
|
||||
#include "dihedral_fourier_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "domain.h"
|
||||
#include "force.h"
|
||||
#include "pair.h"
|
||||
#include "update.h"
|
||||
#include "error.h"
|
||||
|
||||
#include "suffix.h"
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define PTOLERANCE (flt_t)1.05
|
||||
#define MTOLERANCE (flt_t)-1.05
|
||||
typedef struct { int a,b,c,d,t; } int5_t;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
|
||||
: DihedralFourier(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralFourierIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (_use_base) {
|
||||
DihedralFourier::compute(eflag, vflag);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) {
|
||||
ev_setup(eflag,vflag);
|
||||
} else evflag = 0;
|
||||
|
||||
if (evflag) {
|
||||
if (vflag && !eflag) {
|
||||
if (force->newton_bond)
|
||||
eval<0,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,1,0>(vflag, buffers, fc);
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<1,1,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<1,1,0>(vflag, buffers, fc);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_bond)
|
||||
eval<0,0,1>(vflag, buffers, fc);
|
||||
else
|
||||
eval<0,0,0>(vflag, buffers, fc);
|
||||
}
|
||||
}
|
||||
|
||||
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::eval(const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
|
||||
{
|
||||
const int inum = neighbor->ndihedrallist;
|
||||
if (inum == 0) return;
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(0);
|
||||
const int nlocal = atom->nlocal;
|
||||
const int nall = nlocal + atom->nghost;
|
||||
|
||||
int f_stride;
|
||||
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
|
||||
else f_stride = buffers->get_stride(nlocal);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
|
||||
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EFLAG) oedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
|
||||
}
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel default(none) \
|
||||
shared(f_start,f_stride,fc) \
|
||||
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int nfrom, npl, nto, tid;
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
|
||||
#else
|
||||
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
|
||||
#endif
|
||||
|
||||
FORCE_T * _noalias const f = f_start + (tid * f_stride);
|
||||
if (fix->need_zero(tid))
|
||||
memset(f, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
const int5_t * _noalias const dihedrallist =
|
||||
(int5_t *) neighbor->dihedrallist[0];
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
if (EFLAG) sedihedral = (acc_t)0.0;
|
||||
if (VFLAG && vflag) {
|
||||
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
|
||||
}
|
||||
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
for (int n = nfrom; n < nto; n ++) {
|
||||
#else
|
||||
for (int n = nfrom; n < nto; n += npl) {
|
||||
#endif
|
||||
const int i1 = dihedrallist[n].a;
|
||||
const int i2 = dihedrallist[n].b;
|
||||
const int i3 = dihedrallist[n].c;
|
||||
const int i4 = dihedrallist[n].d;
|
||||
const int type = dihedrallist[n].t;
|
||||
|
||||
// 1st bond
|
||||
|
||||
const flt_t vb1x = x[i1].x - x[i2].x;
|
||||
const flt_t vb1y = x[i1].y - x[i2].y;
|
||||
const flt_t vb1z = x[i1].z - x[i2].z;
|
||||
|
||||
// 2nd bond
|
||||
|
||||
const flt_t vb2xm = x[i2].x - x[i3].x;
|
||||
const flt_t vb2ym = x[i2].y - x[i3].y;
|
||||
const flt_t vb2zm = x[i2].z - x[i3].z;
|
||||
|
||||
// 3rd bond
|
||||
|
||||
const flt_t vb3x = x[i4].x - x[i3].x;
|
||||
const flt_t vb3y = x[i4].y - x[i3].y;
|
||||
const flt_t vb3z = x[i4].z - x[i3].z;
|
||||
|
||||
// c,s calculation
|
||||
|
||||
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
|
||||
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
|
||||
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
|
||||
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
|
||||
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
|
||||
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
|
||||
|
||||
const flt_t rasq = ax*ax + ay*ay + az*az;
|
||||
const flt_t rbsq = bx*bx + by*by + bz*bz;
|
||||
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
|
||||
const flt_t rg = sqrt(rgsq);
|
||||
|
||||
flt_t rginv, ra2inv, rb2inv;
|
||||
rginv = ra2inv = rb2inv = (flt_t)0.0;
|
||||
if (rg > 0) rginv = (flt_t)1.0/rg;
|
||||
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
|
||||
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
|
||||
const flt_t rabinv = sqrt(ra2inv*rb2inv);
|
||||
|
||||
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
|
||||
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
|
||||
|
||||
// error check
|
||||
#ifndef LMP_INTEL_USE_SIMDOFF
|
||||
if (c > PTOLERANCE || c < MTOLERANCE) {
|
||||
int me = comm->me;
|
||||
|
||||
if (screen) {
|
||||
char str[128];
|
||||
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT " "
|
||||
TAGINT_FORMAT " " TAGINT_FORMAT,
|
||||
me,tid,update->ntimestep,
|
||||
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
|
||||
error->warning(FLERR,str,0);
|
||||
fprintf(screen," 1st atom: %d %g %g %g\n",
|
||||
me,x[i1].x,x[i1].y,x[i1].z);
|
||||
fprintf(screen," 2nd atom: %d %g %g %g\n",
|
||||
me,x[i2].x,x[i2].y,x[i2].z);
|
||||
fprintf(screen," 3rd atom: %d %g %g %g\n",
|
||||
me,x[i3].x,x[i3].y,x[i3].z);
|
||||
fprintf(screen," 4th atom: %d %g %g %g\n",
|
||||
me,x[i4].x,x[i4].y,x[i4].z);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (c > (flt_t)1.0) c = (flt_t)1.0;
|
||||
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
|
||||
|
||||
flt_t deng;
|
||||
flt_t df = (flt_t)0.0;
|
||||
if (EFLAG) deng = (flt_t)0.0;
|
||||
|
||||
for (int j = 0; j < nterms[type]; j++) {
|
||||
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
|
||||
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
|
||||
const flt_t tk = fc.bp[j][type].k;
|
||||
const int m = fc.bp[j][type].multiplicity;
|
||||
|
||||
flt_t p = (flt_t)1.0;
|
||||
flt_t ddf1, df1;
|
||||
ddf1 = df1 = (flt_t)0.0;
|
||||
|
||||
for (int i = 0; i < m; i++) {
|
||||
ddf1 = p*c - df1*s;
|
||||
df1 = p*s + df1*c;
|
||||
p = ddf1;
|
||||
}
|
||||
|
||||
p = p*tcos_shift + df1*tsin_shift;
|
||||
df1 = df1*tcos_shift - ddf1*tsin_shift;
|
||||
df1 *= -m;
|
||||
p += (flt_t)1.0;
|
||||
|
||||
if (m == 0) {
|
||||
p = (flt_t)1.0 + tcos_shift;
|
||||
df1 = (flt_t)0.0;
|
||||
}
|
||||
|
||||
if (EFLAG) deng += tk * p;
|
||||
df -= tk * df1;
|
||||
}
|
||||
|
||||
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
|
||||
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
|
||||
const flt_t fga = fg*ra2inv*rginv;
|
||||
const flt_t hgb = hg*rb2inv*rginv;
|
||||
const flt_t gaa = -ra2inv*rg;
|
||||
const flt_t gbb = rb2inv*rg;
|
||||
|
||||
const flt_t dtfx = gaa*ax;
|
||||
const flt_t dtfy = gaa*ay;
|
||||
const flt_t dtfz = gaa*az;
|
||||
const flt_t dtgx = fga*ax - hgb*bx;
|
||||
const flt_t dtgy = fga*ay - hgb*by;
|
||||
const flt_t dtgz = fga*az - hgb*bz;
|
||||
const flt_t dthx = gbb*bx;
|
||||
const flt_t dthy = gbb*by;
|
||||
const flt_t dthz = gbb*bz;
|
||||
|
||||
const flt_t sx2 = df*dtgx;
|
||||
const flt_t sy2 = df*dtgy;
|
||||
const flt_t sz2 = df*dtgz;
|
||||
|
||||
flt_t f1x = df*dtfx;
|
||||
flt_t f1y = df*dtfy;
|
||||
flt_t f1z = df*dtfz;
|
||||
|
||||
const flt_t f2x = sx2 - f1x;
|
||||
const flt_t f2y = sy2 - f1y;
|
||||
const flt_t f2z = sz2 - f1z;
|
||||
|
||||
flt_t f4x = df*dthx;
|
||||
flt_t f4y = df*dthy;
|
||||
flt_t f4z = df*dthz;
|
||||
|
||||
const flt_t f3x = -sx2 - f4x;
|
||||
const flt_t f3y = -sy2 - f4y;
|
||||
const flt_t f3z = -sz2 - f4z;
|
||||
|
||||
if (EFLAG || VFLAG) {
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
|
||||
sv0, sv1, sv2, sv3, sv4, sv5);
|
||||
#else
|
||||
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
|
||||
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
|
||||
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
|
||||
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
#pragma simdoff
|
||||
#endif
|
||||
{
|
||||
if (NEWTON_BOND || i1 < nlocal) {
|
||||
f[i1].x += f1x;
|
||||
f[i1].y += f1y;
|
||||
f[i1].z += f1z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i2 < nlocal) {
|
||||
f[i2].x += f2x;
|
||||
f[i2].y += f2y;
|
||||
f[i2].z += f2z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i3 < nlocal) {
|
||||
f[i3].x += f3x;
|
||||
f[i3].y += f3y;
|
||||
f[i3].z += f3z;
|
||||
}
|
||||
|
||||
if (NEWTON_BOND || i4 < nlocal) {
|
||||
f[i4].x += f4x;
|
||||
f[i4].y += f4y;
|
||||
f[i4].z += f4z;
|
||||
}
|
||||
}
|
||||
} // for n
|
||||
#ifdef LMP_INTEL_USE_SIMDOFF
|
||||
if (EFLAG) oedihedral += sedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
ov0 += sv0; ov1 += sv1; ov2 += sv2;
|
||||
ov3 += sv3; ov4 += sv4; ov5 += sv5;
|
||||
}
|
||||
#endif
|
||||
} // omp parallel
|
||||
|
||||
if (EFLAG) energy += oedihedral;
|
||||
if (VFLAG && vflag) {
|
||||
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
|
||||
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
|
||||
}
|
||||
|
||||
fix->set_reduce_flag();
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void DihedralFourierIntel::init_style()
|
||||
{
|
||||
DihedralFourier::init_style();
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
_use_base = 0;
|
||||
if (fix->offload_balance() != 0.0) {
|
||||
_use_base = 1;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
fix->bond_init_check();
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
const int bp1 = atom->ndihedraltypes + 1;
|
||||
fc.set_ntypes(bp1, setflag, nterms, memory);
|
||||
|
||||
for (int i = 1; i < bp1; i++) {
|
||||
if (setflag[i]) {
|
||||
for (int j = 0; j < nterms[i]; j++) {
|
||||
fc.bp[j][i].cos_shift = cos_shift[i][j];
|
||||
fc.bp[j][i].sin_shift = sin_shift[i][j];
|
||||
fc.bp[j][i].k = k[i][j];
|
||||
fc.bp[j][i].multiplicity = multiplicity[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
|
||||
int *setflag,
|
||||
int *nterms,
|
||||
Memory *memory) {
|
||||
if (nbondtypes != _nbondtypes) {
|
||||
if (_nbondtypes > 0)
|
||||
_memory->destroy(bp);
|
||||
|
||||
if (nbondtypes > 0) {
|
||||
_maxnterms = 1;
|
||||
for (int i = 1; i <= nbondtypes; i++)
|
||||
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
|
||||
|
||||
_memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
|
||||
}
|
||||
}
|
||||
_nbondtypes = nbondtypes;
|
||||
_memory = memory;
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef DIHEDRAL_CLASS
|
||||
|
||||
DihedralStyle(fourier/intel,DihedralFourierIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
|
||||
#define LMP_DIHEDRAL_FOURIER_INTEL_H
|
||||
|
||||
#include "dihedral_fourier.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class DihedralFourierIntel : public DihedralFourier {
|
||||
|
||||
public:
|
||||
DihedralFourierIntel(class LAMMPS *lmp);
|
||||
virtual void compute(int, int);
|
||||
void init_style();
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
|
||||
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
int _use_base;
|
||||
#endif
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t cos_shift, sin_shift, k;
|
||||
int multiplicity; } fc_packed1;
|
||||
|
||||
fc_packed1 **bp;
|
||||
|
||||
ForceConst() : _nbondtypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
|
||||
|
||||
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
|
||||
Memory *memory);
|
||||
|
||||
private:
|
||||
int _nbondtypes, _maxnterms;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
|
@ -285,6 +285,7 @@ int FixIntel::setmask()
|
|||
{
|
||||
int mask = 0;
|
||||
mask |= PRE_REVERSE;
|
||||
mask |= MIN_PRE_REVERSE;
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
mask |= POST_FORCE;
|
||||
mask |= MIN_POST_FORCE;
|
||||
|
|
|
@ -43,6 +43,7 @@ class FixIntel : public Fix {
|
|||
virtual int setmask();
|
||||
virtual void init();
|
||||
virtual void setup(int);
|
||||
inline void min_setup(int in) { setup(in); }
|
||||
void setup_pre_reverse(int eflag = 0, int vflag = 0);
|
||||
|
||||
void pair_init_check(const bool cdmessage=false);
|
||||
|
@ -50,6 +51,8 @@ class FixIntel : public Fix {
|
|||
void kspace_init_check();
|
||||
|
||||
void pre_reverse(int eflag = 0, int vflag = 0);
|
||||
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
|
||||
{ pre_reverse(eflag, vflag); }
|
||||
|
||||
// Get all forces, calculation results from coprocesser
|
||||
void sync_coprocessor();
|
||||
|
|
|
@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
|
||||
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
|
||||
#endif
|
||||
memset(_ccachei, 0, vsize * sizeof(int));
|
||||
memset(_ccachej, 0, vsize * sizeof(int));
|
||||
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
|
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
|
|||
#pragma offload_transfer target(mic:_cop) \
|
||||
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
|
||||
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
|
||||
in(ccachej:length(vsize) alloc_if(1) free_if(0))
|
||||
}
|
||||
#ifdef LMP_USE_AVXCD
|
||||
|
|
|
@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||
ito = inum; \
|
||||
}
|
||||
|
||||
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
|
||||
nthr, vecsize) \
|
||||
{ \
|
||||
tid = 0; \
|
||||
ifrom = 0; \
|
||||
ip = 1; \
|
||||
ito = inum; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
|
||||
|
|
|
@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
|||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
|
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
|
|||
const int bstart = binhead[ibin + stencil[k]];
|
||||
const int bend = binhead[ibin + stencil[k] + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
|
|
|
@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||
const int bstart = binhead[ibin + binstart[k]];
|
||||
const int bend = binhead[ibin + binend[k]];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++)
|
||||
|
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
|
|||
const int bstart = binhead[ibin];
|
||||
const int bend = binhead[ibin + 1];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int jj = bstart; jj < bend; jj++) {
|
||||
|
|
|
@ -0,0 +1,617 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
This software is distributed under the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
Shun Xu (Computer Network Information Center, CAS)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <math.h>
|
||||
#include "pair_dpd_intel.h"
|
||||
#include "atom.h"
|
||||
#include "comm.h"
|
||||
#include "force.h"
|
||||
#include "memory.h"
|
||||
#include "modify.h"
|
||||
#include "neighbor.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neigh_request.h"
|
||||
#include "suffix.h"
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define LMP_MKL_RNG VSL_BRNG_MT19937
|
||||
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
|
||||
#define IEPSILON 1.0e10
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
|
||||
PairDPD(lmp)
|
||||
{
|
||||
suffix_flag |= Suffix::INTEL;
|
||||
respa_enable = 0;
|
||||
random_thread = NULL;
|
||||
_nrandom_thread = 0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
PairDPDIntel::~PairDPDIntel()
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
if (_nrandom_thread) {
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
for (int i = 1; i < _nrandom_thread; i++)
|
||||
delete random_thread[i];
|
||||
#else
|
||||
for (int i = 0; i < _nrandom_thread; i++)
|
||||
vslDeleteStream(&random_thread[i]);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
delete []random_thread;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::compute(int eflag, int vflag)
|
||||
{
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
|
||||
force_const_single);
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
|
||||
force_const_double);
|
||||
else
|
||||
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
|
||||
force_const_single);
|
||||
|
||||
fix->balance_stamp();
|
||||
vflag_fdotr = 0;
|
||||
}
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void PairDPDIntel::compute(int eflag, int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc)
|
||||
{
|
||||
if (eflag || vflag) {
|
||||
ev_setup(eflag, vflag);
|
||||
} else evflag = vflag_fdotr = 0;
|
||||
|
||||
const int inum = list->inum;
|
||||
const int nthreads = comm->nthreads;
|
||||
const int host_start = fix->host_start_pair();
|
||||
const int offload_end = fix->offload_end_pair();
|
||||
const int ago = neighbor->ago;
|
||||
|
||||
if (ago != 0 && fix->separate_buffers() == 0) {
|
||||
fix->start_watch(TIME_PACK);
|
||||
|
||||
int packthreads;
|
||||
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
|
||||
else packthreads = 1;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel if(packthreads > 1)
|
||||
#endif
|
||||
{
|
||||
int ifrom, ito, tid;
|
||||
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
|
||||
packthreads, sizeof(ATOM_T));
|
||||
buffers->thr_pack(ifrom,ito,ago);
|
||||
}
|
||||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
int ovflag = 0;
|
||||
if (vflag_fdotr) ovflag = 2;
|
||||
else if (vflag) ovflag = 1;
|
||||
if (_onetype) {
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (eflag) {
|
||||
if (force->newton_pair) {
|
||||
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
} else {
|
||||
if (force->newton_pair) {
|
||||
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
|
||||
} else {
|
||||
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
|
||||
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void PairDPDIntel::eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc,
|
||||
const int astart, const int aend)
|
||||
{
|
||||
const int inum = aend - astart;
|
||||
if (inum == 0) return;
|
||||
int nlocal, nall, minlocal;
|
||||
fix->get_buffern(offload, nlocal, nall, minlocal);
|
||||
|
||||
const int ago = neighbor->ago;
|
||||
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
|
||||
|
||||
ATOM_T * _noalias const x = buffers->get_x(offload);
|
||||
typedef struct { double x, y, z; } lmp_vt;
|
||||
lmp_vt *v = (lmp_vt *)atom->v[0];
|
||||
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
|
||||
|
||||
const int * _noalias const numneigh = list->numneigh;
|
||||
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
|
||||
const int * _noalias const firstneigh = buffers->firstneigh(list);
|
||||
const FC_PACKED1_T * _noalias const param = fc.param[0];
|
||||
const flt_t * _noalias const special_lj = fc.special_lj;
|
||||
int * _noalias const rngi_thread = fc.rngi;
|
||||
const int rng_size = buffers->get_max_nbors();
|
||||
|
||||
const int ntypes = atom->ntypes + 1;
|
||||
const int eatom = this->eflag_atom;
|
||||
|
||||
// Determine how much data to transfer
|
||||
int x_size, q_size, f_stride, ev_size, separate_flag;
|
||||
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
|
||||
buffers, offload, fix, separate_flag,
|
||||
x_size, q_size, ev_size, f_stride);
|
||||
|
||||
int tc;
|
||||
FORCE_T * _noalias f_start;
|
||||
acc_t * _noalias ev_global;
|
||||
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
|
||||
const int nthreads = tc;
|
||||
int *overflow = fix->get_off_overflow_flag();
|
||||
{
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime();
|
||||
#endif
|
||||
|
||||
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
|
||||
f_stride, x, 0);
|
||||
|
||||
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
|
||||
if (EFLAG) oevdwl = (acc_t)0;
|
||||
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
|
||||
|
||||
// loop over neighbors of my atoms
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
|
||||
#endif
|
||||
{
|
||||
int iifrom, iip, iito, tid;
|
||||
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
|
||||
iifrom += astart;
|
||||
iito += astart;
|
||||
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
RanMars *my_random = random_thread[tid];
|
||||
#else
|
||||
VSLStreamStatePtr *my_random = &(random_thread[tid]);
|
||||
#endif
|
||||
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
|
||||
int rngi = rngi_thread[tid];
|
||||
|
||||
int foff;
|
||||
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
|
||||
else foff = -minlocal;
|
||||
FORCE_T * _noalias const f = f_start + foff;
|
||||
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
|
||||
|
||||
flt_t icut, a0, gamma, sigma;
|
||||
if (ONETYPE) {
|
||||
icut = param[3].icut;
|
||||
a0 = param[3].a0;
|
||||
gamma = param[3].gamma;
|
||||
sigma = param[3].sigma;
|
||||
}
|
||||
for (int i = iifrom; i < iito; i += iip) {
|
||||
int itype, ptr_off;
|
||||
const FC_PACKED1_T * _noalias parami;
|
||||
if (!ONETYPE) {
|
||||
itype = x[i].w;
|
||||
ptr_off = itype * ntypes;
|
||||
parami = param + ptr_off;
|
||||
}
|
||||
|
||||
const int * _noalias const jlist = firstneigh + cnumneigh[i];
|
||||
const int jnum = numneigh[i];
|
||||
|
||||
acc_t fxtmp, fytmp, fztmp, fwtmp;
|
||||
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
|
||||
|
||||
const flt_t xtmp = x[i].x;
|
||||
const flt_t ytmp = x[i].y;
|
||||
const flt_t ztmp = x[i].z;
|
||||
const flt_t vxtmp = v[i].x;
|
||||
const flt_t vytmp = v[i].y;
|
||||
const flt_t vztmp = v[i].z;
|
||||
fxtmp = fytmp = fztmp = (acc_t)0;
|
||||
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
|
||||
if (NEWTON_PAIR == 0)
|
||||
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
|
||||
|
||||
if (rngi + jnum > rng_size) {
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
for (int jj = 0; jj < rngi; jj++)
|
||||
my_rand_buffer[jj] = my_random->gaussian();
|
||||
#else
|
||||
if (sizeof(flt_t) == sizeof(float))
|
||||
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
|
||||
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
|
||||
else
|
||||
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
|
||||
(double*)my_rand_buffer, 0.0, 1.0 );
|
||||
#endif
|
||||
rngi = 0;
|
||||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
|
||||
sv0, sv1, sv2, sv3, sv4, sv5)
|
||||
#endif
|
||||
for (int jj = 0; jj < jnum; jj++) {
|
||||
flt_t forcelj, evdwl;
|
||||
forcelj = evdwl = (flt_t)0.0;
|
||||
|
||||
int j, jtype, sbindex;
|
||||
if (!ONETYPE) {
|
||||
sbindex = jlist[jj] >> SBBITS & 3;
|
||||
j = jlist[jj] & NEIGHMASK;
|
||||
} else
|
||||
j = jlist[jj];
|
||||
|
||||
const flt_t delx = xtmp - x[j].x;
|
||||
const flt_t dely = ytmp - x[j].y;
|
||||
const flt_t delz = ztmp - x[j].z;
|
||||
if (!ONETYPE) {
|
||||
jtype = x[j].w;
|
||||
icut = parami[jtype].icut;
|
||||
}
|
||||
const flt_t rsq = delx * delx + dely * dely + delz * delz;
|
||||
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
|
||||
|
||||
if (rinv > icut) {
|
||||
flt_t factor_dpd;
|
||||
if (!ONETYPE) factor_dpd = special_lj[sbindex];
|
||||
|
||||
flt_t delvx = vxtmp - v[j].x;
|
||||
flt_t delvy = vytmp - v[j].y;
|
||||
flt_t delvz = vztmp - v[j].z;
|
||||
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
|
||||
flt_t randnum = my_rand_buffer[jj];
|
||||
|
||||
flt_t iwd = rinv - icut;
|
||||
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
|
||||
|
||||
if (!ONETYPE) {
|
||||
a0 = parami[jtype].a0;
|
||||
gamma = parami[jtype].gamma;
|
||||
sigma = parami[jtype].sigma;
|
||||
}
|
||||
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
|
||||
if (!ONETYPE) fpair *= factor_dpd;
|
||||
fpair *= iwd;
|
||||
|
||||
const flt_t fpx = fpair * delx;
|
||||
fxtmp += fpx;
|
||||
if (NEWTON_PAIR) f[j].x -= fpx;
|
||||
const flt_t fpy = fpair * dely;
|
||||
fytmp += fpy;
|
||||
if (NEWTON_PAIR) f[j].y -= fpy;
|
||||
const flt_t fpz = fpair * delz;
|
||||
fztmp += fpz;
|
||||
if (NEWTON_PAIR) f[j].z -= fpz;
|
||||
|
||||
if (EFLAG) {
|
||||
flt_t cut = (flt_t)1.0/icut;
|
||||
flt_t r = (flt_t)1.0/rinv;
|
||||
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
|
||||
if (!ONETYPE) evdwl *= factor_dpd;
|
||||
sevdwl += evdwl;
|
||||
if (eatom) {
|
||||
fwtmp += (flt_t)0.5 * evdwl;
|
||||
if (NEWTON_PAIR)
|
||||
f[j].w += (flt_t)0.5 * evdwl;
|
||||
}
|
||||
}
|
||||
|
||||
if (NEWTON_PAIR == 0)
|
||||
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
|
||||
} // if rsq
|
||||
} // for jj
|
||||
if (NEWTON_PAIR) {
|
||||
f[i].x += fxtmp;
|
||||
f[i].y += fytmp;
|
||||
f[i].z += fztmp;
|
||||
} else {
|
||||
f[i].x = fxtmp;
|
||||
f[i].y = fytmp;
|
||||
f[i].z = fztmp;
|
||||
}
|
||||
|
||||
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
|
||||
rngi += jnum;
|
||||
} // for ii
|
||||
|
||||
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
|
||||
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
|
||||
ov4, ov5);
|
||||
rngi_thread[tid] = rngi;
|
||||
} // end omp
|
||||
|
||||
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
|
||||
ov0, ov1, ov2, ov3, ov4, ov5);
|
||||
|
||||
if (EFLAG) {
|
||||
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
|
||||
ev_global[0] = oevdwl;
|
||||
ev_global[1] = (acc_t)0.0;
|
||||
}
|
||||
if (vflag) {
|
||||
if (NEWTON_PAIR == 0) {
|
||||
ov0 *= (acc_t)0.5;
|
||||
ov1 *= (acc_t)0.5;
|
||||
ov2 *= (acc_t)0.5;
|
||||
ov3 *= (acc_t)0.5;
|
||||
ov4 *= (acc_t)0.5;
|
||||
ov5 *= (acc_t)0.5;
|
||||
}
|
||||
ev_global[2] = ov0;
|
||||
ev_global[3] = ov1;
|
||||
ev_global[4] = ov2;
|
||||
ev_global[5] = ov3;
|
||||
ev_global[6] = ov4;
|
||||
ev_global[7] = ov5;
|
||||
}
|
||||
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
|
||||
*timer_compute = MIC_Wtime() - *timer_compute;
|
||||
#endif
|
||||
} // end offload
|
||||
|
||||
if (offload)
|
||||
fix->stop_watch(TIME_OFFLOAD_LATENCY);
|
||||
else
|
||||
fix->stop_watch(TIME_HOST_PAIR);
|
||||
|
||||
if (EFLAG || vflag)
|
||||
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
|
||||
else
|
||||
fix->add_result_array(f_start, 0, offload);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
global settings
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::settings(int narg, char **arg) {
|
||||
#if defined(_OPENMP)
|
||||
if (_nrandom_thread) {
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
for (int i = 1; i < _nrandom_thread; i++)
|
||||
delete random_thread[i];
|
||||
#else
|
||||
for (int i = 0; i < _nrandom_thread; i++)
|
||||
vslDeleteStream(&random_thread[i]);
|
||||
#endif
|
||||
}
|
||||
delete []random_thread;
|
||||
#endif
|
||||
PairDPD::settings(narg,arg);
|
||||
_nrandom_thread = comm->nthreads;
|
||||
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
|
||||
random_thread =new RanMars*[comm->nthreads];
|
||||
random_thread[0] = random;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if (tid > 0)
|
||||
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
random_thread=new VSLStreamStatePtr[comm->nthreads];
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
|
||||
seed + comm->me + comm->nprocs * tid );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::init_style()
|
||||
{
|
||||
PairDPD::init_style();
|
||||
if (force->newton_pair == 0) {
|
||||
neighbor->requests[neighbor->nrequest-1]->half = 0;
|
||||
neighbor->requests[neighbor->nrequest-1]->full = 1;
|
||||
}
|
||||
neighbor->requests[neighbor->nrequest-1]->intel = 1;
|
||||
|
||||
int ifix = modify->find_fix("package_intel");
|
||||
if (ifix < 0)
|
||||
error->all(FLERR,
|
||||
"The 'package intel' command is required for /intel styles");
|
||||
fix = static_cast<FixIntel *>(modify->fix[ifix]);
|
||||
|
||||
fix->pair_init_check();
|
||||
#ifdef _LMP_INTEL_OFFLOAD
|
||||
if (fix->offload_balance() != 0.0)
|
||||
error->all(FLERR,
|
||||
"Offload for dpd/intel is not yet available. Set balance to 0.");
|
||||
#endif
|
||||
|
||||
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
|
||||
pack_force_const(force_const_single, fix->get_mixed_buffers());
|
||||
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
|
||||
pack_force_const(force_const_double, fix->get_double_buffers());
|
||||
else
|
||||
pack_force_const(force_const_single, fix->get_single_buffers());
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t,acc_t> *buffers)
|
||||
{
|
||||
_onetype = 0;
|
||||
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
|
||||
|
||||
int tp1 = atom->ntypes + 1;
|
||||
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
|
||||
buffers->set_ntypes(tp1);
|
||||
flt_t **cutneighsq = buffers->get_cutneighsq();
|
||||
|
||||
// Repeat cutsq calculation because done after call to init_style
|
||||
double cut, cutneigh;
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
for (int j = i; j <= atom->ntypes; j++) {
|
||||
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
|
||||
cut = init_one(i,j);
|
||||
cutneigh = cut + neighbor->skin;
|
||||
cutsq[i][j] = cutsq[j][i] = cut*cut;
|
||||
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
|
||||
double icut = 1.0 / cut;
|
||||
fc.param[i][j].icut = fc.param[j][i].icut = icut;
|
||||
} else {
|
||||
cut = init_one(i,j);
|
||||
double icut = 1.0 / cut;
|
||||
fc.param[i][j].icut = fc.param[j][i].icut = icut;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
fc.special_lj[i] = force->special_lj[i];
|
||||
fc.special_lj[0] = 1.0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < tp1; i++) {
|
||||
for (int j = 0; j < tp1; j++) {
|
||||
fc.param[i][j].a0 = a0[i][j];
|
||||
fc.param[i][j].gamma = gamma[i][j];
|
||||
fc.param[i][j].sigma = sigma[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template <class flt_t>
|
||||
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
|
||||
const int nthreads,
|
||||
const int max_nbors,
|
||||
Memory *memory,
|
||||
const int cop) {
|
||||
if (ntypes != _ntypes) {
|
||||
if (_ntypes > 0) {
|
||||
_memory->destroy(param);
|
||||
_memory->destroy(rand_buffer_thread);
|
||||
_memory->destroy(rngi);
|
||||
}
|
||||
if (ntypes > 0) {
|
||||
_cop = cop;
|
||||
memory->create(param,ntypes,ntypes,"fc.param");
|
||||
memory->create(rand_buffer_thread, nthreads, max_nbors,
|
||||
"fc.rand_buffer_thread");
|
||||
memory->create(rngi,nthreads,"fc.param");
|
||||
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
|
||||
}
|
||||
}
|
||||
_ntypes = ntypes;
|
||||
_memory = memory;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
proc 0 reads from restart file, bcasts
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairDPDIntel::read_restart_settings(FILE *fp)
|
||||
{
|
||||
#if defined(_OPENMP)
|
||||
if (_nrandom_thread) {
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
for (int i = 1; i < _nrandom_thread; i++)
|
||||
delete random_thread[i];
|
||||
#else
|
||||
for (int i = 0; i < _nrandom_thread; i++)
|
||||
vslDeleteStream(&random_thread[i]);
|
||||
#endif
|
||||
}
|
||||
delete []random_thread;
|
||||
#endif
|
||||
PairDPD::read_restart_settings(fp);
|
||||
_nrandom_thread = comm->nthreads;
|
||||
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
|
||||
random_thread =new RanMars*[comm->nthreads];
|
||||
random_thread[0] = random;
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if (tid > 0)
|
||||
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
random_thread=new VSLStreamStatePtr[comm->nthreads];
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
|
||||
seed + comm->me + comm->nprocs * tid );
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: W. Michael Brown (Intel)
|
||||
Shun Xu (Computer Network Information Center, CAS)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
|
||||
PairStyle(dpd/intel,PairDPDIntel)
|
||||
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_DPD_INTEL_H
|
||||
#define LMP_PAIR_DPD_INTEL_H
|
||||
|
||||
#include "pair_dpd.h"
|
||||
#include "fix_intel.h"
|
||||
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
#include "random_mars.h"
|
||||
#else
|
||||
#include "mkl_vsl.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairDPDIntel : public PairDPD {
|
||||
|
||||
public:
|
||||
PairDPDIntel(class LAMMPS *);
|
||||
~PairDPDIntel();
|
||||
|
||||
virtual void compute(int, int);
|
||||
void settings(int, char **);
|
||||
void init_style();
|
||||
void read_restart_settings(FILE *);
|
||||
|
||||
private:
|
||||
FixIntel *fix;
|
||||
int _cop, _onetype, _nrandom_thread;
|
||||
|
||||
#ifdef LMP_NO_MKL_RNG
|
||||
RanMars **random_thread;
|
||||
#else
|
||||
VSLStreamStatePtr *random_thread;
|
||||
#endif
|
||||
|
||||
template <class flt_t> class ForceConst;
|
||||
template <class flt_t, class acc_t>
|
||||
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
|
||||
const ForceConst<flt_t> &fc);
|
||||
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
|
||||
void eval(const int offload, const int vflag,
|
||||
IntelBuffers<flt_t,acc_t> * buffers,
|
||||
const ForceConst<flt_t> &fc, const int astart, const int aend);
|
||||
|
||||
template <class flt_t, class acc_t>
|
||||
void pack_force_const(ForceConst<flt_t> &fc,
|
||||
IntelBuffers<flt_t, acc_t> *buffers);
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
template <class flt_t>
|
||||
class ForceConst {
|
||||
public:
|
||||
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
|
||||
|
||||
_alignvar(flt_t special_lj[4],64);
|
||||
fc_packed1 **param;
|
||||
flt_t **rand_buffer_thread;
|
||||
int *rngi;
|
||||
|
||||
ForceConst() : _ntypes(0) {}
|
||||
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
|
||||
|
||||
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
|
||||
Memory *memory, const int cop);
|
||||
|
||||
private:
|
||||
int _ntypes, _cop;
|
||||
Memory *_memory;
|
||||
};
|
||||
ForceConst<float> force_const_single;
|
||||
ForceConst<double> force_const_double;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: The 'package intel' command is required for /intel styles
|
||||
|
||||
Self-explanatory.
|
||||
|
||||
*/
|
Loading…
Reference in New Issue