Adding pair style dpd/intel and dihedral style fourier/intel

Adding raw performance numbers for a Skylake Xeon server.
Fixes for using older Intel compilers and compiling without OpenMP.
Fix: add hooks for using USER-INTEL with minimization.
Michael Brown 2017-10-02 04:53:17 -07:00
parent f2c1172741
commit cf24dd0265
17 changed files with 1345 additions and 19 deletions

Binary image file changed (not shown): 20 KiB before, 19 KiB after.


@ -25,12 +25,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
[Currently Available USER-INTEL Styles:]
Angle Styles: charmm, harmonic :ulb,l
Bond Styles: fene, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Dihedral Styles: charmm, fourier, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
@ -82,6 +82,10 @@ this order :l
The {newton} setting applies to all atoms, not just atoms shared
between MPI tasks :l
Vectorization can change the order for adding pairwise forces :l
Unless specified otherwise at build time, the random number
generator for dissipative particle dynamics uses the Mersenne
Twister generator (which should be more robust than the standard
generator) :l
:ule
The precision mode (described below) used with the USER-INTEL


@ -7,6 +7,7 @@
:line
dihedral_style fourier command :h3
dihedral_style fourier/intel command :h3
dihedral_style fourier/omp command :h3
[Syntax:]


@ -8,6 +8,7 @@
pair_style dpd command :h3
pair_style dpd/gpu command :h3
pair_style dpd/intel command :h3
pair_style dpd/omp command :h3
pair_style dpd/tstat command :h3
pair_style dpd/tstat/gpu command :h3


@ -30,14 +30,15 @@ be added or changed in the Makefile depending on the version:
2017 update 2 - No changes needed
2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
2018 initial release - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
2018u1 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high
-----------------------------------------------------------------------------
When using the suffix command with "intel", intel styles will be used if they
exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP
USER-OMP styles will be used whenever USER-INTEL styles are not available. This
allows for running most styles in LAMMPS with threading.
is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
available. This allows for running most styles in LAMMPS with threading.
-----------------------------------------------------------------------------
@ -52,6 +53,15 @@ need to be changed.
-----------------------------------------------------------------------------
The random number generator for Dissipative Particle Dynamics (DPD) in the
Intel package uses the Mersenne Twister pseudorandom number generator as
implemented in the Intel Math Kernel Library (MKL). This generator is faster
and more robust, with a significantly longer period, than the default DPD
generator. However, if MKL is not installed, the standard random number
generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG".
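
For reference, here is a minimal self-contained sketch of the batch-generation
pattern used on the MKL path (a VSL_BRNG_MT19937 Mersenne Twister stream filled
with vdRngGaussian, as in pair_dpd_intel.cpp); the std::mt19937 branch below is
only a stand-in for the RanMars fallback that -DLMP_NO_MKL_RNG selects, not the
actual LAMMPS generator:

#ifdef LMP_NO_MKL_RNG
#include <random>            // stand-in here for the RanMars fallback in LAMMPS
#else
#include "mkl_vsl.h"         // Intel MKL Vector Statistics Library
#endif

// Fill buf with n standard-normal random numbers for one thread.
static void fill_gaussian(double *buf, int n, unsigned int seed) {
#ifdef LMP_NO_MKL_RNG
  std::mt19937 gen(seed);
  std::normal_distribution<double> gauss(0.0, 1.0);
  for (int i = 0; i < n; i++) buf[i] = gauss(gen);
#else
  VSLStreamStatePtr stream;
  vslNewStream(&stream, VSL_BRNG_MT19937, seed);    // Mersenne Twister stream
  vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, buf, 0.0, 1.0);
  vslDeleteStream(&stream);
#endif
}
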
-----------------------------------------------------------------------------
In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag
-DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of
Intel compilers.


@ -9,6 +9,7 @@
# in.intel.tersoff - Silicon benchmark with Tersoff
# in.intel.water - Coarse-grain water benchmark using Stillinger-Weber
# in.intel.airebo - Polyethylene benchmark with AIREBO
# in.intel.dpd - Dissipative Particle Dynamics
#
#############################################################################
@ -16,16 +17,17 @@
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
# Xeon E5-2697v4 Xeon Phi 7250
# Xeon E5-2697v4 Xeon Phi 7250 Xeon Gold 6148
#
# in.intel.lj - 199.5 282.3
# in.intel.rhodo - 12.4 17.5
# in.intel.lc - 19.0 25.7
# in.intel.eam - 59.4 92.8
# in.intel.sw - 132.4 161.9
# in.intel.tersoff - 83.3 101.1
# in.intel.water - 53.4 90.3
# in.intel.airebo - 7.3 11.8
# in.intel.lj - 199.5 282.3 317.3
# in.intel.rhodo - 12.4 17.5 24.4
# in.intel.lc - 19.0 25.7 26.8
# in.intel.eam - 59.4 92.8 105.6
# in.intel.sw - 132.4 161.9 213.8
# in.intel.tersoff - 83.3 101.1 109.6
# in.intel.water - 53.4 90.3 105.5
# in.intel.airebo - 7.3 11.8 17.6
# in.intel.dpd - 74.5 100.4 148.1
#
#############################################################################


@ -0,0 +1,48 @@
# DPD benchmark
variable N index on # Newton Setting
variable w index 10 # Warmup Timesteps
variable t index 4000 # Main Run Timesteps
variable m index 1 # Main Run Timestep Multiplier
variable n index 0 # Use NUMA Mapping for Multi-Node
variable p index 0 # Use Power Measurement
variable x index 4
variable y index 2
variable z index 2
variable xx equal 20*$x
variable yy equal 20*$y
variable zz equal 20*$z
variable rr equal floor($t*$m)
newton $N
if "$n > 0" then "processors * * * grid numa"
units lj
atom_style atomic
comm_modify mode single vel yes
lattice fcc 3.0
region box block 0 ${xx} 0 ${yy} 0 ${zz}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 1.0 87287 loop geom
pair_style dpd 1.0 1.0 928948
pair_coeff 1 1 25.0 4.5
neighbor 0.5 bin
neigh_modify delay 0 every 1
fix 1 all nve
timestep 0.04
thermo 1000
if "$p > 0" then "run_style verlet/power"
if "$w > 0" then "run $w"
run ${rr}


@ -0,0 +1,441 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <math.h>
#include "dihedral_fourier_intel.h"
#include "atom.h"
#include "comm.h"
#include "memory.h"
#include "neighbor.h"
#include "domain.h"
#include "force.h"
#include "pair.h"
#include "update.h"
#include "error.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define PTOLERANCE (flt_t)1.05
#define MTOLERANCE (flt_t)-1.05
typedef struct { int a,b,c,d,t; } int5_t;
/* ---------------------------------------------------------------------- */
DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
: DihedralFourier(lmp)
{
suffix_flag |= Suffix::INTEL;
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::compute(int eflag, int vflag)
{
#ifdef _LMP_INTEL_OFFLOAD
if (_use_base) {
DihedralFourier::compute(eflag, vflag);
return;
}
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag,vflag);
} else evflag = 0;
if (evflag) {
if (vflag && !eflag) {
if (force->newton_bond)
eval<0,1,1>(vflag, buffers, fc);
else
eval<0,1,0>(vflag, buffers, fc);
} else {
if (force->newton_bond)
eval<1,1,1>(vflag, buffers, fc);
else
eval<1,1,0>(vflag, buffers, fc);
}
} else {
if (force->newton_bond)
eval<0,0,1>(vflag, buffers, fc);
else
eval<0,0,0>(vflag, buffers, fc);
}
}
template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void DihedralFourierIntel::eval(const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
const int inum = neighbor->ndihedrallist;
if (inum == 0) return;
ATOM_T * _noalias const x = buffers->get_x(0);
const int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
int f_stride;
if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
else f_stride = buffers->get_stride(nlocal);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(f_start,f_stride,fc) \
reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int nfrom, npl, nto, tid;
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
#else
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
#endif
FORCE_T * _noalias const f = f_start + (tid * f_stride);
if (fix->need_zero(tid))
memset(f, 0, f_stride * sizeof(FORCE_T));
const int5_t * _noalias const dihedrallist =
(int5_t *) neighbor->dihedrallist[0];
#ifdef LMP_INTEL_USE_SIMDOFF
acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
if (EFLAG) sedihedral = (acc_t)0.0;
if (VFLAG && vflag) {
sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
}
#pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
for (int n = nfrom; n < nto; n ++) {
#else
for (int n = nfrom; n < nto; n += npl) {
#endif
const int i1 = dihedrallist[n].a;
const int i2 = dihedrallist[n].b;
const int i3 = dihedrallist[n].c;
const int i4 = dihedrallist[n].d;
const int type = dihedrallist[n].t;
// 1st bond
const flt_t vb1x = x[i1].x - x[i2].x;
const flt_t vb1y = x[i1].y - x[i2].y;
const flt_t vb1z = x[i1].z - x[i2].z;
// 2nd bond
const flt_t vb2xm = x[i2].x - x[i3].x;
const flt_t vb2ym = x[i2].y - x[i3].y;
const flt_t vb2zm = x[i2].z - x[i3].z;
// 3rd bond
const flt_t vb3x = x[i4].x - x[i3].x;
const flt_t vb3y = x[i4].y - x[i3].y;
const flt_t vb3z = x[i4].z - x[i3].z;
// c,s calculation
const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
const flt_t rasq = ax*ax + ay*ay + az*az;
const flt_t rbsq = bx*bx + by*by + bz*bz;
const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
const flt_t rg = sqrt(rgsq);
flt_t rginv, ra2inv, rb2inv;
rginv = ra2inv = rb2inv = (flt_t)0.0;
if (rg > 0) rginv = (flt_t)1.0/rg;
if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
const flt_t rabinv = sqrt(ra2inv*rb2inv);
flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
// error check
#ifndef LMP_INTEL_USE_SIMDOFF
if (c > PTOLERANCE || c < MTOLERANCE) {
int me = comm->me;
if (screen) {
char str[128];
sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT " "
TAGINT_FORMAT " " TAGINT_FORMAT,
me,tid,update->ntimestep,
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
error->warning(FLERR,str,0);
fprintf(screen," 1st atom: %d %g %g %g\n",
me,x[i1].x,x[i1].y,x[i1].z);
fprintf(screen," 2nd atom: %d %g %g %g\n",
me,x[i2].x,x[i2].y,x[i2].z);
fprintf(screen," 3rd atom: %d %g %g %g\n",
me,x[i3].x,x[i3].y,x[i3].z);
fprintf(screen," 4th atom: %d %g %g %g\n",
me,x[i4].x,x[i4].y,x[i4].z);
}
}
#endif
if (c > (flt_t)1.0) c = (flt_t)1.0;
if (c < (flt_t)-1.0) c = (flt_t)-1.0;
flt_t deng;
flt_t df = (flt_t)0.0;
if (EFLAG) deng = (flt_t)0.0;
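// Fourier dihedral energy: E = sum_j K_j * [1 + cos(m_j*phi - d_j)]. The inner
// loop builds cos(m*phi) and sin(m*phi) from (c,s) = (cos(phi),sin(phi)) by
// recurrence, the shift terms apply the phase d, and df accumulates -dE/dphi.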
for (int j = 0; j < nterms[type]; j++) {
const flt_t tcos_shift = fc.bp[j][type].cos_shift;
const flt_t tsin_shift = fc.bp[j][type].sin_shift;
const flt_t tk = fc.bp[j][type].k;
const int m = fc.bp[j][type].multiplicity;
flt_t p = (flt_t)1.0;
flt_t ddf1, df1;
ddf1 = df1 = (flt_t)0.0;
for (int i = 0; i < m; i++) {
ddf1 = p*c - df1*s;
df1 = p*s + df1*c;
p = ddf1;
}
p = p*tcos_shift + df1*tsin_shift;
df1 = df1*tcos_shift - ddf1*tsin_shift;
df1 *= -m;
p += (flt_t)1.0;
if (m == 0) {
p = (flt_t)1.0 + tcos_shift;
df1 = (flt_t)0.0;
}
if (EFLAG) deng += tk * p;
df -= tk * df1;
}
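// Project -dE/dphi onto Cartesian forces on the four atoms using the geometric
// derivatives of the dihedral angle (same algebra as the other dihedral styles).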
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
const flt_t fga = fg*ra2inv*rginv;
const flt_t hgb = hg*rb2inv*rginv;
const flt_t gaa = -ra2inv*rg;
const flt_t gbb = rb2inv*rg;
const flt_t dtfx = gaa*ax;
const flt_t dtfy = gaa*ay;
const flt_t dtfz = gaa*az;
const flt_t dtgx = fga*ax - hgb*bx;
const flt_t dtgy = fga*ay - hgb*by;
const flt_t dtgz = fga*az - hgb*bz;
const flt_t dthx = gbb*bx;
const flt_t dthy = gbb*by;
const flt_t dthz = gbb*bz;
const flt_t sx2 = df*dtgx;
const flt_t sy2 = df*dtgy;
const flt_t sz2 = df*dtgz;
flt_t f1x = df*dtfx;
flt_t f1y = df*dtfy;
flt_t f1z = df*dtfz;
const flt_t f2x = sx2 - f1x;
const flt_t f2y = sy2 - f1y;
const flt_t f2z = sz2 - f1z;
flt_t f4x = df*dthx;
flt_t f4y = df*dthy;
flt_t f4z = df*dthz;
const flt_t f3x = -sx2 - f4x;
const flt_t f3y = -sy2 - f4y;
const flt_t f3z = -sz2 - f4z;
if (EFLAG || VFLAG) {
#ifdef LMP_INTEL_USE_SIMDOFF
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
sv0, sv1, sv2, sv3, sv4, sv5);
#else
IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
ov0, ov1, ov2, ov3, ov4, ov5);
#endif
}
#ifdef LMP_INTEL_USE_SIMDOFF
#pragma simdoff
#endif
{
if (NEWTON_BOND || i1 < nlocal) {
f[i1].x += f1x;
f[i1].y += f1y;
f[i1].z += f1z;
}
if (NEWTON_BOND || i2 < nlocal) {
f[i2].x += f2x;
f[i2].y += f2y;
f[i2].z += f2z;
}
if (NEWTON_BOND || i3 < nlocal) {
f[i3].x += f3x;
f[i3].y += f3y;
f[i3].z += f3z;
}
if (NEWTON_BOND || i4 < nlocal) {
f[i4].x += f4x;
f[i4].y += f4y;
f[i4].z += f4z;
}
}
} // for n
#ifdef LMP_INTEL_USE_SIMDOFF
if (EFLAG) oedihedral += sedihedral;
if (VFLAG && vflag) {
ov0 += sv0; ov1 += sv1; ov2 += sv2;
ov3 += sv3; ov4 += sv4; ov5 += sv5;
}
#endif
} // omp parallel
if (EFLAG) energy += oedihedral;
if (VFLAG && vflag) {
virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
}
fix->set_reduce_flag();
}
/* ---------------------------------------------------------------------- */
void DihedralFourierIntel::init_style()
{
DihedralFourier::init_style();
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
#ifdef _LMP_INTEL_OFFLOAD
_use_base = 0;
if (fix->offload_balance() != 0.0) {
_use_base = 1;
return;
}
#endif
fix->bond_init_check();
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1, setflag, nterms, memory);
for (int i = 1; i < bp1; i++) {
if (setflag[i]) {
for (int j = 0; j < nterms[i]; j++) {
fc.bp[j][i].cos_shift = cos_shift[i][j];
fc.bp[j][i].sin_shift = sin_shift[i][j];
fc.bp[j][i].k = k[i][j];
fc.bp[j][i].multiplicity = multiplicity[i][j];
}
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
int *setflag,
int *nterms,
Memory *memory) {
if (nbondtypes != _nbondtypes) {
if (_nbondtypes > 0)
_memory->destroy(bp);
if (nbondtypes > 0) {
_maxnterms = 1;
for (int i = 1; i <= nbondtypes; i++)
if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
}
}
_nbondtypes = nbondtypes;
_memory = memory;
}


@ -0,0 +1,82 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef DIHEDRAL_CLASS
DihedralStyle(fourier/intel,DihedralFourierIntel)
#else
#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
#define LMP_DIHEDRAL_FOURIER_INTEL_H
#include "dihedral_fourier.h"
#include "fix_intel.h"
namespace LAMMPS_NS {
class DihedralFourierIntel : public DihedralFourier {
public:
DihedralFourierIntel(class LAMMPS *lmp);
virtual void compute(int, int);
void init_style();
private:
FixIntel *fix;
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
#ifdef _LMP_INTEL_OFFLOAD
int _use_base;
#endif
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t cos_shift, sin_shift, k;
int multiplicity; } fc_packed1;
fc_packed1 **bp;
ForceConst() : _nbondtypes(0) {}
~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
Memory *memory);
private:
int _nbondtypes, _maxnterms;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif


@ -285,6 +285,7 @@ int FixIntel::setmask()
{
int mask = 0;
mask |= PRE_REVERSE;
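// also hook the pre_reverse stage during minimization (forwarded by min_pre_reverse)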
mask |= MIN_PRE_REVERSE;
#ifdef _LMP_INTEL_OFFLOAD
mask |= POST_FORCE;
mask |= MIN_POST_FORCE;


@ -43,6 +43,7 @@ class FixIntel : public Fix {
virtual int setmask();
virtual void init();
virtual void setup(int);
inline void min_setup(int in) { setup(in); }
void setup_pre_reverse(int eflag = 0, int vflag = 0);
void pair_init_check(const bool cdmessage=false);
@ -50,6 +51,8 @@ class FixIntel : public Fix {
void kspace_init_check();
void pre_reverse(int eflag = 0, int vflag = 0);
inline void min_pre_reverse(int eflag = 0, int vflag = 0)
{ pre_reverse(eflag, vflag); }
// Get all forces, calculation results from coprocessor
void sync_coprocessor();


@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
#endif
memset(_ccachei, 0, vsize * sizeof(int));
memset(_ccachej, 0, vsize * sizeof(int));
#ifdef _LMP_INTEL_OFFLOAD
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
#pragma offload_transfer target(mic:_cop) \
nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
in(ccachej:length(vsize) alloc_if(1) free_if(0))
}
#ifdef LMP_USE_AVXCD


@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
ito = inum; \
}
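// Fallback used when OpenMP is not enabled: thread 0 covers the whole range
// with unit stride (the vecsize argument is unused here).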
#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
nthr, vecsize) \
{ \
tid = 0; \
ifrom = 0; \
ip = 1; \
ito = inum; \
}
#endif
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \


@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
const int bstart = binhead[ibin + stencil[k]];
const int bend = binhead[ibin + stencil[k] + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)


@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin + binstart[k]];
const int bend = binhead[ibin + binend[k]];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++)
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
const int bstart = binhead[ibin];
const int bend = binhead[ibin + 1];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd
#endif
for (int jj = bstart; jj < bend; jj++) {


@ -0,0 +1,617 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
This software is distributed under the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#include <math.h>
#include "pair_dpd_intel.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "memory.h"
#include "modify.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "suffix.h"
using namespace LAMMPS_NS;
#define LMP_MKL_RNG VSL_BRNG_MT19937
#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
#define IEPSILON 1.0e10
/* ---------------------------------------------------------------------- */
PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
PairDPD(lmp)
{
suffix_flag |= Suffix::INTEL;
respa_enable = 0;
random_thread = NULL;
_nrandom_thread = 0;
}
/* ---------------------------------------------------------------------- */
PairDPDIntel::~PairDPDIntel()
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
#endif
delete []random_thread;
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::compute(int eflag, int vflag)
{
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
force_const_single);
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
compute<double,double>(eflag, vflag, fix->get_double_buffers(),
force_const_double);
else
compute<float,float>(eflag, vflag, fix->get_single_buffers(),
force_const_single);
fix->balance_stamp();
vflag_fdotr = 0;
}
template <class flt_t, class acc_t>
void PairDPDIntel::compute(int eflag, int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc)
{
if (eflag || vflag) {
ev_setup(eflag, vflag);
} else evflag = vflag_fdotr = 0;
const int inum = list->inum;
const int nthreads = comm->nthreads;
const int host_start = fix->host_start_pair();
const int offload_end = fix->offload_end_pair();
const int ago = neighbor->ago;
if (ago != 0 && fix->separate_buffers() == 0) {
fix->start_watch(TIME_PACK);
int packthreads;
if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
else packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int ifrom, ito, tid;
IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
packthreads, sizeof(ATOM_T));
buffers->thr_pack(ifrom,ito,ago);
}
fix->stop_watch(TIME_PACK);
}
int ovflag = 0;
if (vflag_fdotr) ovflag = 2;
else if (vflag) ovflag = 1;
if (_onetype) {
if (eflag) {
if (force->newton_pair) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (eflag) {
if (force->newton_pair) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (force->newton_pair) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
}
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void PairDPDIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc,
const int astart, const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
int nlocal, nall, minlocal;
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * _noalias const x = buffers->get_x(offload);
typedef struct { double x, y, z; } lmp_vt;
lmp_vt *v = (lmp_vt *)atom->v[0];
const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
const int rng_size = buffers->get_max_nbors();
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
buffers, offload, fix, separate_flag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
{
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime();
#endif
IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iip, iito, tid;
IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
iifrom += astart;
iito += astart;
#ifdef LMP_NO_MKL_RNG
RanMars *my_random = random_thread[tid];
#else
VSLStreamStatePtr *my_random = &(random_thread[tid]);
#endif
flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
int rngi = rngi_thread[tid];
int foff;
if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
else foff = -minlocal;
FORCE_T * _noalias const f = f_start + foff;
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
}
for (int i = iifrom; i < iito; i += iip) {
int itype, ptr_off;
const FC_PACKED1_T * _noalias parami;
if (!ONETYPE) {
itype = x[i].w;
ptr_off = itype * ntypes;
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
const flt_t vxtmp = v[i].x;
const flt_t vytmp = v[i].y;
const flt_t vztmp = v[i].z;
fxtmp = fytmp = fztmp = (acc_t)0;
if (EFLAG) fwtmp = sevdwl = (acc_t)0;
if (NEWTON_PAIR == 0)
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
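// Refill the consumed prefix of this thread's Gaussian buffer when this atom's
// neighbor count would overrun it: batch generation through MKL VSL, or
// per-value RanMars draws when compiled with -DLMP_NO_MKL_RNG.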
if (rngi + jnum > rng_size) {
#ifdef LMP_NO_MKL_RNG
for (int jj = 0; jj < rngi; jj++)
my_rand_buffer[jj] = my_random->gaussian();
#else
if (sizeof(flt_t) == sizeof(float))
vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(float*)my_rand_buffer, (float)0.0, (float)1.0 );
else
vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi,
(double*)my_rand_buffer, 0.0, 1.0 );
#endif
rngi = 0;
}
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;
int j, jtype, sbindex;
if (!ONETYPE) {
sbindex = jlist[jj] >> SBBITS & 3;
j = jlist[jj] & NEIGHMASK;
} else
j = jlist[jj];
const flt_t delx = xtmp - x[j].x;
const flt_t dely = ytmp - x[j].y;
const flt_t delz = ztmp - x[j].z;
if (!ONETYPE) {
jtype = x[j].w;
icut = parami[jtype].icut;
}
const flt_t rsq = delx * delx + dely * dely + delz * delz;
const flt_t rinv = (flt_t)1.0/sqrt(rsq);
if (rinv > icut) {
flt_t factor_dpd;
if (!ONETYPE) factor_dpd = special_lj[sbindex];
flt_t delvx = vxtmp - v[j].x;
flt_t delvy = vytmp - v[j].y;
flt_t delvz = vztmp - v[j].z;
flt_t dot = delx*delvx + dely*delvy + delz*delvz;
flt_t randnum = my_rand_buffer[rngi + jj];
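// iwd = (1 - r/r_c)/r is the DPD weight function divided by r, so multiplying
// fpair by delx/dely/delz below gives Cartesian force components; fpair combines
// the conservative (a0), dissipative (gamma) and random (sigma) DPD terms.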
flt_t iwd = rinv - icut;
if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
if (!ONETYPE) {
a0 = parami[jtype].a0;
gamma = parami[jtype].gamma;
sigma = parami[jtype].sigma;
}
flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
if (!ONETYPE) fpair *= factor_dpd;
fpair *= iwd;
const flt_t fpx = fpair * delx;
fxtmp += fpx;
if (NEWTON_PAIR) f[j].x -= fpx;
const flt_t fpy = fpair * dely;
fytmp += fpy;
if (NEWTON_PAIR) f[j].y -= fpy;
const flt_t fpz = fpair * delz;
fztmp += fpz;
if (NEWTON_PAIR) f[j].z -= fpz;
if (EFLAG) {
flt_t cut = (flt_t)1.0/icut;
flt_t r = (flt_t)1.0/rinv;
evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
if (!ONETYPE) evdwl *= factor_dpd;
sevdwl += evdwl;
if (eatom) {
fwtmp += (flt_t)0.5 * evdwl;
if (NEWTON_PAIR)
f[j].w += (flt_t)0.5 * evdwl;
}
}
if (NEWTON_PAIR == 0)
IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
} // if rsq
} // for jj
if (NEWTON_PAIR) {
f[i].x += fxtmp;
f[i].y += fytmp;
f[i].z += fztmp;
} else {
f[i].x = fxtmp;
f[i].y = fytmp;
f[i].z = fztmp;
}
IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
rngi += jnum;
} // for ii
IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
ov4, ov5);
rngi_thread[tid] = rngi;
} // end omp
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
ov3 *= (acc_t)0.5;
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
#endif
} // end offload
if (offload)
fix->stop_watch(TIME_OFFLOAD_LATENCY);
else
fix->stop_watch(TIME_HOST_PAIR);
if (EFLAG || vflag)
fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
else
fix->add_result_array(f_start, 0, offload);
}
/* ----------------------------------------------------------------------
global settings
------------------------------------------------------------------------- */
void PairDPDIntel::settings(int narg, char **arg) {
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
delete []random_thread;
#endif
PairDPD::settings(narg,arg);
_nrandom_thread = comm->nthreads;
#ifdef LMP_NO_MKL_RNG
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#else
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#endif
}
/* ---------------------------------------------------------------------- */
void PairDPDIntel::init_style()
{
PairDPD::init_style();
if (force->newton_pair == 0) {
neighbor->requests[neighbor->nrequest-1]->half = 0;
neighbor->requests[neighbor->nrequest-1]->full = 1;
}
neighbor->requests[neighbor->nrequest-1]->intel = 1;
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
fix = static_cast<FixIntel *>(modify->fix[ifix]);
fix->pair_init_check();
#ifdef _LMP_INTEL_OFFLOAD
if (fix->offload_balance() != 0.0)
error->all(FLERR,
"Offload for dpd/intel is not yet available. Set balance to 0.");
#endif
if (fix->precision() == FixIntel::PREC_MODE_MIXED)
pack_force_const(force_const_single, fix->get_mixed_buffers());
else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
pack_force_const(force_const_double, fix->get_double_buffers());
else
pack_force_const(force_const_single, fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
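// The ONETYPE specialization skips per-neighbor type lookups and special_lj
// scaling, so it is only used with a single atom type and no bonded exclusions.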
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat the cutsq calculation because it is done after the call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
}
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;
}
for (int i = 0; i < tp1; i++) {
for (int j = 0; j < tp1; j++) {
fc.param[i][j].a0 = a0[i][j];
fc.param[i][j].gamma = gamma[i][j];
fc.param[i][j].sigma = sigma[i][j];
}
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t>
void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nthreads,
const int max_nbors,
Memory *memory,
const int cop) {
if (ntypes != _ntypes) {
if (_ntypes > 0) {
_memory->destroy(param);
_memory->destroy(rand_buffer_thread);
_memory->destroy(rngi);
}
if (ntypes > 0) {
_cop = cop;
memory->create(param,ntypes,ntypes,"fc.param");
memory->create(rand_buffer_thread, nthreads, max_nbors,
"fc.rand_buffer_thread");
memory->create(rngi,nthreads,"fc.param");
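// Start each thread's counter at max_nbors so the random buffer is filled on
// first use in eval().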
for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
}
}
_ntypes = ntypes;
_memory = memory;
}
/* ----------------------------------------------------------------------
proc 0 reads from restart file, bcasts
------------------------------------------------------------------------- */
void PairDPDIntel::read_restart_settings(FILE *fp)
{
#if defined(_OPENMP)
if (_nrandom_thread) {
#ifdef LMP_NO_MKL_RNG
for (int i = 1; i < _nrandom_thread; i++)
delete random_thread[i];
#else
for (int i = 0; i < _nrandom_thread; i++)
vslDeleteStream(&random_thread[i]);
#endif
}
delete []random_thread;
#endif
PairDPD::read_restart_settings(fp);
_nrandom_thread = comm->nthreads;
#ifdef LMP_NO_MKL_RNG
random_thread =new RanMars*[comm->nthreads];
random_thread[0] = random;
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid > 0)
random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
}
#endif
#else
random_thread=new VSLStreamStatePtr[comm->nthreads];
#if defined(_OPENMP)
#pragma omp parallel
{
int tid = omp_get_thread_num();
vslNewStream(&random_thread[tid], LMP_MKL_RNG,
seed + comm->me + comm->nprocs * tid );
}
#endif
#endif
}


@ -0,0 +1,110 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
Shun Xu (Computer Network Information Center, CAS)
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(dpd/intel,PairDPDIntel)
#else
#ifndef LMP_PAIR_DPD_INTEL_H
#define LMP_PAIR_DPD_INTEL_H
#include "pair_dpd.h"
#include "fix_intel.h"
#ifdef LMP_NO_MKL_RNG
#include "random_mars.h"
#else
#include "mkl_vsl.h"
#endif
namespace LAMMPS_NS {
class PairDPDIntel : public PairDPD {
public:
PairDPDIntel(class LAMMPS *);
~PairDPDIntel();
virtual void compute(int, int);
void settings(int, char **);
void init_style();
void read_restart_settings(FILE *);
private:
FixIntel *fix;
int _cop, _onetype, _nrandom_thread;
#ifdef LMP_NO_MKL_RNG
RanMars **random_thread;
#else
VSLStreamStatePtr *random_thread;
#endif
template <class flt_t> class ForceConst;
template <class flt_t, class acc_t>
void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc);
template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers,
const ForceConst<flt_t> &fc, const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
// ----------------------------------------------------------------------
template <class flt_t>
class ForceConst {
public:
typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
_alignvar(flt_t special_lj[4],64);
fc_packed1 **param;
flt_t **rand_buffer_thread;
int *rngi;
ForceConst() : _ntypes(0) {}
~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
Memory *memory, const int cop);
private:
int _ntypes, _cop;
Memory *_memory;
};
ForceConst<float> force_const_single;
ForceConst<double> force_const_double;
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self-explanatory.
*/