Adding pair style dpd/intel and dihedral style fourier/intel

Adding raw performance numbers for a Skylake Xeon server.
Fixes for using older Intel compilers and compiling without OpenMP.
Fix: add hooks for using USER-INTEL with minimization.
Michael Brown 2017-10-02 04:53:17 -07:00
parent f2c1172741
commit cf24dd0265
17 changed files with 1345 additions and 19 deletions

Binary image file changed (not shown): 20 KiB before, 19 KiB after.


@ -25,12 +25,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
[Currently Available USER-INTEL Styles:]
Angle Styles: charmm, harmonic :ulb,l
Bond Styles: fene, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Dihedral Styles: charmm, fourier, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
@ -82,6 +82,10 @@ this order :l
The {newton} setting applies to all atoms, not just atoms shared
between MPI tasks :l
Vectorization can change the order for adding pairwise forces :l
Unless specified otherwise at build time, the random number
generator for dissipative particle dynamics uses the Mersenne
Twister generator (which should be more robust than the standard
generator) :l
:ule
The precision mode (described below) used with the USER-INTEL


@ -7,6 +7,7 @@
:line
dihedral_style fourier command :h3
dihedral_style fourier/intel command :h3
dihedral_style fourier/omp command :h3
[Syntax:]


@ -8,6 +8,7 @@
pair_style dpd command :h3
pair_style dpd/gpu command :h3
pair_style dpd/intel command :h3
pair_style dpd/omp command :h3
pair_style dpd/tstat command :h3
pair_style dpd/tstat/gpu command :h3


@ -30,14 +30,15 @@ be added or changed in the Makefile depending on the version:
2017 update 2 - No changes needed
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
2018 initial release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
-----------------------------------------------------------------------------
When using the suffix command with "intel", intel styles will be used if they
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
allows for running most styles in LAMMPS with threading.
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
available. This allows for running most styles in LAMMPS with threading.
-----------------------------------------------------------------------------
@ -52,6 +53,15 @@ need to be changed.
-----------------------------------------------------------------------------
The random number generator for Dissipative Particle Dynamics (DPD) in the
Intel package uses the Mersenne Twister pseudorandom number generator as
implemented in the Intel Math Kernel Library (MKL). This generator is faster
and more robust, with a significantly longer period, than the default DPD
generator. However, if MKL is not installed, the standard random number
generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG".
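
For reference, here is a minimal self-contained sketch of the batch-generation
pattern used on the MKL path (a VSL_BRNG_MT19937 Mersenne Twister stream filled
with vdRngGaussian, as in pair_dpd_intel.cpp); the std::mt19937 branch below is
only a stand-in for the RanMars fallback that -DLMP_NO_MKL_RNG selects, not the
actual LAMMPS generator:

#ifdef LMP_NO_MKL_RNG
#include <random>            // stand-in here for the RanMars fallback in LAMMPS
#else
#include "mkl_vsl.h"         // Intel MKL Vector Statistics Library
#endif

// Fill buf with n standard-normal random numbers for one thread.
static void fill_gaussian(double *buf, int n, unsigned int seed) {
#ifdef LMP_NO_MKL_RNG
  std::mt19937 gen(seed);
  std::normal_distribution<double> gauss(0.0, 1.0);
  for (int i = 0; i < n; i++) buf[i] = gauss(gen);
#else
  VSLStreamStatePtr stream;
  vslNewStream(&stream, VSL_BRNG_MT19937, seed);    // Mersenne Twister stream
  vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, buf, 0.0, 1.0);
  vslDeleteStream(&stream);
#endif
}
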
-----------------------------------------------------------------------------
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
Intel compilers.


@ -9,6 +9,7 @@
# in.intel.tersoff - Silicon benchmark with Tersoff
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
# in.intel.airebo - Polyethylene benchmark with AIREBO
# in.intel.dpd - Dissipative Particle Dynamics
#
#############################################################################
@ -16,16 +17,17 @@
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
# Xeon E5-2697v4 Xeon Phi 7250
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
#
# in.intel.lj - 199.5 282.3
# in.intel.rhodo - 12.4 17.5
# in.intel.lc - 19.0 25.7
# in.intel.eam - 59.4 92.8
# in.intel.sw - 132.4 161.9
# in.intel.tersoff - 83.3 101.1
# in.intel.water - 53.4 90.3
# in.intel.airebo - 7.3 11.8
# in.intel.lj - 199.5 282.3 317.3
# in.intel.rhodo - 12.4 17.5 24.4
# in.intel.lc - 19.0 25.7 26.8
# in.intel.eam - 59.4 92.8 105.6
# in.intel.sw - 132.4 161.9 213.8
# in.intel.tersoff - 83.3 101.1 109.6
# in.intel.water - 53.4 90.3 105.5
# in.intel.airebo - 7.3 11.8 17.6
# in.intel.dpd - 74.5 100.4 148.1
#
#############################################################################


@ -0,0 +1,48 @@
# DPD benchmark
variable N index on # Newton Setting
variable w index 10 # Warmup Timesteps
variable t index 4000 # Main Run Timesteps
variable m index 1 # Main Run Timestep Multiplier
variable n index 0 # Use NUMA Mapping for Multi-Node
variable p index 0 # Use Power Measurement
variable x index 4
variable y index 2
variable z index 2
variable xx equal 20*$x
variable yy equal 20*$y
variable zz equal 20*$z
variable rr equal floor($t*$m)
newton $N
if "$n > 0" then "processors * * * grid numa"
units lj
atom_style atomic
comm_modify mode single vel yes
lattice fcc 3.0
region box block 0 ${xx} 0 ${yy} 0 ${zz}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 1.0 87287 loop geom
pair_style dpd 1.0 1.0 928948
pair_coeff 1 1 25.0 4.5
neighbor 0.5 bin
neigh_modify delay 0 every 1
fix 1 all nve
timestep 0.04
thermo 1000
if "$p > 0" then "run_style verlet/power"
if "$w > 0" then "run $w"
run ${rr}


@ -0,0 +1,441 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <math.h>
#include "dihedral_fourier_intel.h"
#include "atom.h"
#include "comm.h"
#include "memory.h"
#include "neighbor.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "error.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define PTOLERANCE (flt_t)1.05
#define MTOLERANCE (flt_t)-1.05
typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */
DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
: DihedralFourier(lmp)
{
suffix_flag |= Suffix::INTEL;
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::compute(int eflag, int vflag)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_use_base) {
DihedralFourier::compute(eflag, vflag);
return;
}
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = 0;
if (evflag) {
if (vflag && !eflag) {
if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc);
else
eval<0,1,0>(vflag, buffers, fc);
} else {
if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc);
else
eval<1,1,0>(vflag, buffers, fc);
}
} else {
if (force->newton_bond)
eval<0,0,1>(vflag, buffers, fc);
else
eval<0,0,0>(vflag, buffers, fc);
}
}
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralFourierIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
const int inum = neighbor->ndihedrallist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int nfrom, npl, nto, tid;
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
#else
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
#endif
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
if (EFLAG) sedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
// 1st bond
const flt_t vb1x = x[i1].x - x[i2].x;
const flt_t vb1y = x[i1].y - x[i2].y;
const flt_t vb1z = x[i1].z - x[i2].z;
// 2nd bond
const flt_t vb2xm = x[i2].x - x[i3].x;
const flt_t vb2ym = x[i2].y - x[i3].y;
const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z;
// c,s calculation
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
const flt_t rasq = ax*ax + ay*ay + az*az;
const flt_t rbsq = bx*bx + by*by + bz*bz;
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
const flt_t rg = sqrt(rgsq);
flt_t rginv, ra2inv, rb2inv;
rginv = ra2inv = rb2inv = (flt_t)0.0;
if (rg > 0) rginv = (flt_t)1.0/rg;
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
const flt_t rabinv = sqrt(ra2inv*rb2inv);
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
// error check
#ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me;
if (screen) {
char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z);
}
}
#endif
if (c > (flt_t)1.0) c = (flt_t)1.0;
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
flt_t deng;
flt_t df = (flt_t)0.0;
if (EFLAG) deng = (flt_t)0.0;
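// Fourier dihedral energy: E = sum_j K_j * [1 + cos(m_j*phi - d_j)]. The inner
// loop builds cos(m*phi) and sin(m*phi) from (c,s) = (cos(phi),sin(phi)) by
// recurrence, the shift terms apply the phase d, and df accumulates -dE/dphi.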
for (int j = 0; j < nterms[type]; j++) {
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
const flt_t tk = fc.bp[j][type].k;
const int m = fc.bp[j][type].multiplicity;
flt_t p = (flt_t)1.0;
flt_t ddf1, df1;
ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s;
df1 = p*s + df1*c;
p = ddf1;
}
p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m;
p += (flt_t)1.0;
if (m == 0) {
p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0;
}
if (EFLAG) deng += tk * p;
df -= tk * df1;
}
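// Project -dE/dphi onto Cartesian forces on the four atoms using the geometric
// derivatives of the dihedral angle (same algebra as the other dihedral styles).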
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
const flt_t fga = fg*ra2inv*rginv;
const flt_t hgb = hg*rb2inv*rginv;
const flt_t gaa = -ra2inv*rg;
const flt_t gbb = rb2inv*rg;
const flt_t dtfx = gaa*ax;
const flt_t dtfy = gaa*ay;
const flt_t dtfz = gaa*az;
const flt_t dtgx = fga*ax - hgb*bx;
const flt_t dtgy = fga*ay - hgb*by;
const flt_t dtgz = fga*az - hgb*bz;
const flt_t dthx = gbb*bx;
const flt_t dthy = gbb*by;
const flt_t dthz = gbb*bz;
const flt_t sx2 = df*dtgx;
const flt_t sy2 = df*dtgy;
const flt_t sz2 = df*dtgz;
flt_t f1x = df*dtfx;
flt_t f1y = df*dtfy;
flt_t f1z = df*dtfz;
const flt_t f2x = sx2 - f1x;
const flt_t f2y = sy2 - f1y;
const flt_t f2z = sz2 - f1z;
flt_t f4x = df*dthx;
flt_t f4y = df*dthy;
flt_t f4z = df*dthz;
const flt_t f3x = -sx2 - f4x;
const flt_t f3y = -sy2 - f4y;
const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5);
#else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5);
#endif
}
#ifdef LMP_INTEL_USE_SIMDOFF
#pragma simdoff
#endif
{
if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x;
f[i1].y += f1y;
f[i1].z += f1z;
}
if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x;
f[i2].y += f2y;
f[i2].z += f2z;
}
if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x;
f[i3].y += f3y;
f[i3].z += f3z;
}
if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x;
f[i4].y += f4y;
f[i4].z += f4z;
}
}
} // for n
#ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5;
}
#endif
} // omp parallel
if (EFLAG) energy += oedihedral;
if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
}
fix->set_reduce_flag();
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::init_style()
{
DihedralFourier::init_style();
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
_use_base = 0;
if (fix->offload_balance() != 0.0) {
_use_base = 1;
return;
}
#endif
fix->bond_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1, setflag, nterms, memory);
for (int i = 1; i < bp1; i++) {
if (setflag[i]) {
for (int j = 0; j < nterms[i]; j++) {
fc.bp[j][i].cos_shift = cos_shift[i][j];
fc.bp[j][i].sin_shift = sin_shift[i][j];
fc.bp[j][i].k = k[i][j];
fc.bp[j][i].multiplicity = multiplicity[i][j];
}
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
int *setflag,
int *nterms,
Memory *memory) {
if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0)
_memory->destroy(bp);
if (nbondtypes > 0) {
_maxnterms = 1;
for (int i = 1; i <= nbondtypes; i++)
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
}
}
_nbondtypes = nbondtypes;
_memory = memory;
}


@ -0,0 +1,82 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef DIHEDRAL_CLASS
DihedralStyle(fourier/intel,DihedralFourierIntel)
#else
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
#define LMP_DIHEDRAL_FOURIER_INTEL_H
#include "dihedral_fourier.h"
#include "fix_intel.h"
namespace LAMMPS_NS {
class DihedralFourierIntel : public DihedralFourier {
public:
DihedralFourierIntel(class LAMMPS *lmp);
virtual void compute(int, int);
void init_style();
private:
FixIntel *fix;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed1;
fc_packed1 **bp;
ForceConst() : _nbondtypes(0) {}
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
Memory *memory);
private:
int _nbondtypes, _maxnterms;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif


@ -285,6 +285,7 @@ int FixIntel::setmask()
{
int mask = 0;
mask |= PRE_REVERSE;
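// also hook the pre_reverse stage during minimization (forwarded by min_pre_reverse)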
mask |= MIN_PRE_REVERSE;
#ifdef _LMP_INTEL_OFFLOAD
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;


@ -43,6 +43,7 @@ class FixIntel : public Fix {
virtual int setmask();
virtual void init();
virtual void setup(int);
inline void min_setup(int in) { setup(in); }
void setup_pre_reverse(int eflag = 0, int vflag = 0);
void pair_init_check(const bool cdmessage=false);
@ -50,6 +51,8 @@ class FixIntel : public Fix {
void kspace_init_check();
void pre_reverse(int eflag = 0, int vflag = 0);
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
{ pre_reverse(eflag, vflag); }
// Get all forces, calculation results from coprocessor
void sync_coprocessor();


@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
#endif
memset(_ccachei, 0, vsize * sizeof(int));
memset(_ccachej, 0, vsize * sizeof(int));
#ifdef _LMP_INTEL_OFFLOAD
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
#pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachej:length(vsize) alloc_if(1) free_if(0))
}
#ifdef LMP_USE_AVXCD


@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
ito = inum; \
}
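// Fallback used when OpenMP is not enabled: thread 0 covers the whole range
// with unit stride (the vecsize argument is unused here).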
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
nthr, vecsize) \
{ \
tid = 0; \
ifrom = 0; \
ip = 1; \
ito = inum; \
}
#endif
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \


@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)


@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++) {


@ -0,0 +1,617 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_dpd_intel.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define LMP_MKL_RNG VSL_BRNG_MT19937
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define IEPSILON 1.0e10
/* ---------------------------------------------------------------------- */
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
PairDPD(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
random_thread = NULL;
_nrandom_thread = 0;
}
/* ---------------------------------------------------------------------- */
PairDPDIntel::~PairDPDIntel()
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
#endif
delete []random_thread;
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::compute(int eflag, int vflag)
{
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairDPDIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (_onetype) {
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (eflag) {
if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
}
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairDPDIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
typedef struct { double x, y, z; } lmp_vt;
lmp_vt *v = (lmp_vt *)atom->v[0];
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
const int rng_size = buffers->get_max_nbors();
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iip, iito, tid;
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
#ifdef LMP_NO_MKL_RNG
RanMars *my_random = random_thread[tid];
#else
VSLStreamStatePtr *my_random = &(random_thread[tid]);
#endif
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
int rngi = rngi_thread[tid];
int foff;
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
else foff = -minlocal;
FORCE_T * _noalias const f = f_start + foff;
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
}
for (int i = iifrom; i < iito; i += iip) {
int itype, ptr_off;
const FC_PACKED1_T * _noalias parami;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const flt_t vxtmp = v[i].x;
const flt_t vytmp = v[i].y;
const flt_t vztmp = v[i].z;
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
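// Refill the consumed prefix of this thread's Gaussian buffer when this atom's
// neighbor count would overrun it: batch generation through MKL VSL, or
// per-value RanMars draws when compiled with -DLMP_NO_MKL_RNG.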
if (rngi + jnum > rng_size) {
#ifdef LMP_NO_MKL_RNG
for (int jj = 0; jj < rngi; jj++)
my_rand_buffer[jj] = my_random->gaussian();
#else
if (sizeof(flt_t) == sizeof(float))
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
else
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(double*)my_rand_buffer, 0.0, 1.0 );
#endif
rngi = 0;
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
icut = parami[jtype].icut;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
if (rinv > icut) {
flt_t factor_dpd;
if (!ONETYPE) factor_dpd = special_lj[sbindex];
flt_t delvx = vxtmp - v[j].x;
flt_t delvy = vytmp - v[j].y;
flt_t delvz = vztmp - v[j].z;
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
flt_t randnum = my_rand_buffer[rngi + jj];
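// iwd = (1 - r/r_c)/r is the DPD weight function divided by r, so multiplying
// fpair by delx/dely/delz below gives Cartesian force components; fpair combines
// the conservative (a0), dissipative (gamma) and random (sigma) DPD terms.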
flt_t iwd = rinv - icut;
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
if (!ONETYPE) {
a0 = parami[jtype].a0;
gamma = parami[jtype].gamma;
sigma = parami[jtype].sigma;
}
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
if (!ONETYPE) fpair *= factor_dpd;
fpair *= iwd;
const flt_t fpx = fpair * delx;
fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely;
fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz;
fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) {
flt_t cut = (flt_t)1.0/icut;
flt_t r = (flt_t)1.0/rinv;
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
if (!ONETYPE) evdwl *= factor_dpd;
sevdwl += evdwl;
if (eatom) {
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl;
}
}
if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
} // if rsq
} // for jj
if (NEWTON_PAIR) {
f[i].x += fxtmp;
f[i].y += fytmp;
f[i].z += fztmp;
} else {
f[i].x = fxtmp;
f[i].y = fytmp;
f[i].z = fztmp;
}
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
rngi += jnum;
} // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5);
rngi_thread[tid] = rngi;
} // end omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EFLAG || vflag)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairDPDIntel::settings(int narg, char **arg) {
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
delete []random_thread;
#endif
PairDPD::settings(narg,arg);
_nrandom_thread = comm->nthreads;
#ifdef LMP_NO_MKL_RNG
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#else
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#endif
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::init_style()
{
PairDPD::init_style();
if (force->newton_pair == 0) {
neighbor->requests[neighbor->nrequest-1]->half = 0;
neighbor->requests[neighbor->nrequest-1]->full = 1;
}
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_balance() != 0.0)
error->all(FLERR,
"Offload for dpd/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
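// The ONETYPE specialization skips per-neighbor type lookups and special_lj
// scaling, so it is only used with a single atom type and no bonded exclusions.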
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat the cutsq calculation because it is done after the call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.param[i][j].a0 = a0[i][j];
fc.param[i][j].gamma = gamma[i][j];
fc.param[i][j].sigma = sigma[i][j];
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nthreads,
const int max_nbors,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
_memory->destroy(param);
_memory->destroy(rand_buffer_thread);
_memory->destroy(rngi);
}
if (ntypes > 0) {
_cop = cop;
memory->create(param,ntypes,ntypes,"fc.param");
memory->create(rand_buffer_thread, nthreads, max_nbors,
"fc.rand_buffer_thread");
memory->create(rngi,nthreads,"fc.param");
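// Start each thread's counter at max_nbors so the random buffer is filled on
// first use in eval().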
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
}
}
_ntypes = ntypes;
_memory = memory;
}
/* ----------------------------------------------------------------------
proc 0 reads from restart file, bcasts
------------------------------------------------------------------------- */
void PairDPDIntel::read_restart_settings(FILE *fp)
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
delete []random_thread;
#endif
PairDPD::read_restart_settings(fp);
_nrandom_thread = comm->nthreads;
#ifdef LMP_NO_MKL_RNG
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#else
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#endif
}


@ -0,0 +1,110 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(dpd/intel,PairDPDIntel)
#else
#ifndef LMP_PAIR_DPD_INTEL_H
#define LMP_PAIR_DPD_INTEL_H
#include "pair_dpd.h"
#include "fix_intel.h"
#ifdef LMP_NO_MKL_RNG
#include "random_mars.h"
#else
#include "mkl_vsl.h"
#endif
namespace LAMMPS_NS {
class PairDPDIntel : public PairDPD {
public:
PairDPDIntel(class LAMMPS *);
~PairDPDIntel();
virtual void compute(int, int);
void settings(int, char **);
void init_style();
void read_restart_settings(FILE *);
private:
FixIntel *fix;
int _cop, _onetype, _nrandom_thread;
#ifdef LMP_NO_MKL_RNG
RanMars **random_thread;
#else
VSLStreamStatePtr *random_thread;
#endif
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
// ----------------------------------------------------------------------
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
_alignvar(flt_t special_lj[4],64);
fc_packed1 **param;
flt_t **rand_buffer_thread;
int *rngi;
ForceConst() : _ntypes(0) {}
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
Memory *memory, const int cop);
private:
int _ntypes, _cop;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self-explanatory.
*/