git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15166 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-06-14 16:33:40 +00:00 · 2016-06-14 16:33:40 +00:00 · 2dbcfdc70c
parent ecffbbe531
commit 2dbcfdc70c
8 changed files with 893 additions and 8 deletions
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@ -6,6 +6,7 @@
             W. Michael Brown (Intel) michael.w.brown at intel.com
                   Rodrigo Canales (RWTH Aachen University)
                  Markus H<>hnerbach (RWTH Aachen University)
+                           Stan Moore (Sandia)
 		   Ahmed E. Ismail (RWTH Aachen University)
                   Paolo Bientinesi (RWTH Aachen University)
                          Anupama Kurpad (Intel)
--- a/src/USER-INTEL/bond_fene_intel.cpp
+++ b/src/USER-INTEL/bond_fene_intel.cpp
@ -0,0 +1,295 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdlib.h>
+#include "bond_fene_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "suffix.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+typedef struct { int a,b,t;  } int3_t;
+
+/* ---------------------------------------------------------------------- */
+
+BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) 
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+BondFENEIntel::~BondFENEIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENEIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    BondFENE::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void BondFENEIntel::compute(int eflag, int vflag,
+				IntelBuffers<flt_t,acc_t> *buffers,
+				const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  if (evflag) {
+    if (eflag) {
+      if (force->newton_bond)
+	eval<1,1,1>(vflag, buffers, fc);
+      else
+	eval<1,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+	eval<1,0,1>(vflag, buffers, fc);
+      else
+	eval<1,0,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void BondFENEIntel::eval(const int vflag, 
+			     IntelBuffers<flt_t,acc_t> *buffers,
+			     const ForceConst<flt_t> &fc)
+{
+  const int inum = neighbor->nbondlist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oebond = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    }
+  }
+
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)		  \
+    reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int3_t * _noalias const bondlist = 
+      (int3_t *) neighbor->bondlist[0];
+
+    for (int n = nfrom; n < nto; n++) {
+      const int i1 = bondlist[n].a;
+      const int i2 = bondlist[n].b;
+      const int type = bondlist[n].t;
+
+      const flt_t ir0sq = fc.fc[type].ir0sq;
+      const flt_t k = fc.fc[type].k;
+      const flt_t sigma = fc.fc[type].sigma;
+      const flt_t sigmasq = sigma*sigma;
+      const flt_t epsilon = fc.fc[type].epsilon;
+
+      const flt_t delx = x[i1].x - x[i2].x;
+      const flt_t dely = x[i1].y - x[i2].y;
+      const flt_t delz = x[i1].z - x[i2].z;
+
+      const flt_t rsq = delx*delx + dely*dely + delz*delz;
+      flt_t rlogarg = (flt_t)1.0 - rsq * ir0sq;
+      flt_t irsq = (flt_t)1.0 / rsq;
+
+      // if r -> r0, then rlogarg < 0.0 which is an error
+      // issue a warning and reset rlogarg = epsilon
+      // if r > 2*r0 something serious is wrong, abort
+      
+      if (rlogarg < (flt_t)0.1) {
+        char str[128];
+        sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
+                TAGINT_FORMAT " " TAGINT_FORMAT " %g",
+                update->ntimestep,atom->tag[i1],atom->tag[i2],sqrt(rsq));
+        error->warning(FLERR,str,0);
+        if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
+        rlogarg = (flt_t)0.1;
+      }
+      
+      flt_t fbond = -k/rlogarg;
+      
+      // force from LJ term
+      
+      flt_t sr2,sr6;
+      if (rsq < (flt_t)TWO_1_3*sigmasq) {
+	sr2 = sigmasq * irsq;
+        sr6 = sr2 * sr2 * sr2;
+        fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
+      }
+      
+      // energy
+
+      flt_t ebond;
+      if (EFLAG) {
+        ebond = (flt_t)-0.5 * k / ir0sq * log(rlogarg);
+        if (rsq < (flt_t)TWO_1_3 * sigmasq)
+          ebond += (flt_t)4.0 * epsilon * sr6 * (sr6 - (flt_t)1.0) + epsilon;
+      }
+
+      // apply force to each of 2 atoms
+
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1].x += delx*fbond;
+        f[i1].y += dely*fbond;
+        f[i1].z += delz*fbond;
+      }
+
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2].x -= delx*fbond;
+        f[i2].y -= dely*fbond;
+        f[i2].z -= delz*fbond;
+      }
+
+      if (EVFLAG) {
+	IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, 
+                             delx, dely, delz, oebond, f, NEWTON_BOND, 
+                             nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      }
+    } // for n
+  } // omp parallel
+
+  if (EVFLAG) {
+    if (EFLAG)
+      energy += oebond;
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENEIntel::init_style()
+{
+  BondFENE::init_style();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                         IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->nbondtypes + 1;
+  fc.set_ntypes(bp1,memory);
+
+  for (int i = 0; i < bp1; i++) {
+    fc.fc[i].k = k[i];
+    fc.fc[i].ir0sq = 1.0 / (r0[i] * r0[i]);
+    fc.fc[i].sigma = sigma[i];
+    fc.fc[i].epsilon = epsilon[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
+	                                              Memory *memory) {
+  if (nbondtypes != _nbondtypes) {
+    if (_nbondtypes > 0)
+      _memory->destroy(fc);
+    
+    if (nbondtypes > 0)
+      _memory->create(fc,nbondtypes,"bondfeneintel.fc");
+  }
+  _nbondtypes = nbondtypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/bond_fene_intel.h
+++ b/src/USER-INTEL/bond_fene_intel.h
@ -0,0 +1,88 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(fene/intel,BondFENEIntel)
+
+#else
+
+#ifndef LMP_BOND_FENE_INTEL_H
+#define LMP_BOND_FENE_INTEL_H
+
+#include <stdio.h>
+#include "bond_fene.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class BondFENEIntel : public BondFENE {
+ public:
+  BondFENEIntel(class LAMMPS *);
+  virtual ~BondFENEIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+
+ protected:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
+	    const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t k, ir0sq, sigma, epsilon; } fc_packed1;
+    fc_packed1 *fc;
+
+    ForceConst() : _nbondtypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL); }
+
+    void set_ntypes(const int nbondtypes, Memory *memory);
+
+   private:
+    int _nbondtypes;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args for bond coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+*/
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@ -28,12 +28,22 @@
 #include "update.h"
 #include "error.h"

+#ifdef LMP_USE_AVXCD
+#if (__INTEL_COMPILER_BUILD_DATE > 20160414)
+#define LMP_USE_AVXCD_DHC
+#endif
+#endif
+
+#ifdef LMP_USE_AVXCD_DHC
+#include "intel_simd.h"
+using namespace ip_simd;
+#endif
+
 #include "suffix.h"
 using namespace LAMMPS_NS;

 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
-#define SMALL      (flt_t)0.001
 typedef struct { int a,b,c,d,t;  } int5_t;

 /* ---------------------------------------------------------------------- */
@ -102,6 +112,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
  }
 }

+#ifndef LMP_USE_AVXCD_DHC
+
 template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
@ -388,6 +400,8 @@ void DihedralCharmmIntel::eval(const int vflag,
 	      f[i4].w += evdwl;
 	  }
 	}
+	//	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+	//				   delx, dely, delz);
 	if (vflag) {                                                    
 	  spv0 += ev_pre * delx * delx * fpair;                               
 	  spv1 += ev_pre * dely * dely * fpair;                               
@ -448,6 +462,442 @@ void DihedralCharmmIntel::eval(const int vflag,
  fix->set_reduce_flag();
 }

+#else
+
+/* ----------------------------------------------------------------------
+
+Vector intrinsics are temporarily being used for the Stillinger-Weber
+potential to allow for advanced features in the AVX512 instruction set to
+be exploited on early hardware. We hope to see compiler improvements for
+AVX512 that will eliminate this requirement, so it is not recommended to
+develop code based on the intrinsics implementation. Please e-mail the
+authors for more details.
+
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void DihedralCharmmIntel::eval(const int vflag, 
+			       IntelBuffers<flt_t,acc_t> *buffers,
+			       const ForceConst<flt_t> &fc)
+
+{
+  typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
+  typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
+  const int swidth = SIMD_type<flt_t>::width();
+
+  const int inum = neighbor->ndihedrallist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  flt_t * _noalias const q = buffers->get_q(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
+  acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oevdwl = oecoul = oedihedral = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+      opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)		  \
+    reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
+	      opv0,opv1,opv2,opv3,opv4,opv5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int * _noalias const dihedrallist = 
+      (int *) neighbor->dihedrallist[0];
+    const flt_t * _noalias const weight = &(fc.weight[0]);
+    const flt_t * _noalias const x_f = &(x[0].x);
+    const flt_t * _noalias const cos_shift = &(fc.bp[0].cos_shift);
+    const flt_t * _noalias const sin_shift = &(fc.bp[0].sin_shift);
+    const flt_t * _noalias const k = &(fc.bp[0].k);
+    const int * _noalias const multiplicity = &(fc.bp[0].multiplicity);
+    const flt_t * _noalias const plj1 = &(fc.ljp[0][0].lj1);
+    const flt_t * _noalias const plj2 = &(fc.ljp[0][0].lj2);
+    const flt_t * _noalias const plj3 = &(fc.ljp[0][0].lj3);
+    const flt_t * _noalias const plj4 = &(fc.ljp[0][0].lj4);
+    acc_t * _noalias const pforce= &(f[0].x);
+    acc_t * _noalias const featom = &(f[0].w);
+    const flt_t qqrd2e = force->qqrd2e;
+
+    SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
+    SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
+    if (EVFLAG) {
+      if (EFLAG) {
+	sevdwl = SIMD_set((acc_t)0.0);
+	secoul = SIMD_set((acc_t)0.0);
+	sedihedral = SIMD_set((acc_t)0.0);
+      }
+      if (vflag) {
+	sv0 = SIMD_set((acc_t)0.0);
+	sv1 = SIMD_set((acc_t)0.0);
+	sv2 = SIMD_set((acc_t)0.0);
+	sv3 = SIMD_set((acc_t)0.0);
+	sv4 = SIMD_set((acc_t)0.0);
+	sv5 = SIMD_set((acc_t)0.0);
+	spv0 = SIMD_set((acc_t)0.0);
+	spv1 = SIMD_set((acc_t)0.0);
+	spv2 = SIMD_set((acc_t)0.0);
+	spv3 = SIMD_set((acc_t)0.0);
+	spv4 = SIMD_set((acc_t)0.0);
+	spv5 = SIMD_set((acc_t)0.0);
+      }
+    }
+
+    SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
+				 55, 60, 65, 70, 75) + (nfrom * 5);
+    const int nto5 = nto * 5;
+    const int nlocals4 = nlocal << 4;
+    const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
+    const int ntypes = atom->ntypes + 1;
+
+    for (int n = nfrom; n < nto; n += swidth) {
+      SIMD_mask nmask = n_offset < nto5;
+      SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
+      const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
+      i1 = i1 << 4;
+      const SIMD_int i2 = SIMD_gather(nmask, dihedrallist+1, n_offset) << 4;
+      const SIMD_int i3 = SIMD_gather(nmask, dihedrallist+2, n_offset) << 4;
+      SIMD_int i4 = SIMD_gather(nmask, dihedrallist+3, n_offset);
+      const SIMD_flt_t q4 = SIMD_gather(nmask, q, i4);
+      i4 = i4 << 4;
+      SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
+      const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
+      type = type << 2;
+      n_offset = n_offset + swidth * 5;
+
+      // 1st bond
+
+      SIMD_flt_t x1, x2, y1, y2, z1, z2;
+      SIMD_int itype;
+
+      SIMD_atom_gather(nmask, x_f, i1, x1, y1, z1, itype);
+      SIMD_atom_gather(nmask, x_f, i2, x2, y2, z2);
+
+      const SIMD_flt_t vb1x = x1 - x2;
+      const SIMD_flt_t vb1y = y1 - y2;
+      const SIMD_flt_t vb1z = z1 - z2;
+
+      // 2nd bond
+
+      SIMD_flt_t x3, y3, z3;
+
+      SIMD_atom_gather(nmask, x_f, i3, x3, y3, z3);
+
+      const SIMD_flt_t vb2xm = x2 - x3;
+      const SIMD_flt_t vb2ym = y2 - y3;
+      const SIMD_flt_t vb2zm = z2 - z3;
+
+      // 3rd bond
+      
+      SIMD_flt_t x4, y4, z4;
+      SIMD_int jtype;
+
+      SIMD_atom_gather(nmask, x_f, i4, x4, y4, z4, jtype);
+
+      const SIMD_flt_t vb3x = x4 - x3;
+      const SIMD_flt_t vb3y = y4 - y3;
+      const SIMD_flt_t vb3z = z4 - z3;
+
+      // 1-4
+
+      const SIMD_flt_t delx = x1 - x4;
+      const SIMD_flt_t dely = y1 - y4;
+      const SIMD_flt_t delz = z1 - z4;
+
+      // c,s calculation
+
+      const SIMD_flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
+      const SIMD_flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
+      const SIMD_flt_t az = vb1x*vb2ym - vb1y*vb2xm;
+      const SIMD_flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
+      const SIMD_flt_t by = vb3z*vb2xm - vb3x*vb2zm;
+      const SIMD_flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
+
+      const SIMD_flt_t rasq = ax*ax + ay*ay + az*az;
+      const SIMD_flt_t rbsq = bx*bx + by*by + bz*bz;
+      const SIMD_flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+      const SIMD_flt_t rg = SIMD_sqrt(rgsq);
+
+      const SIMD_flt_t szero = SIMD_set((flt_t)0.0);
+      const SIMD_flt_t rginv = SIMD_rcpz(rg > szero, rg);
+      const SIMD_flt_t ra2inv = SIMD_rcpz(rasq > szero, rasq);
+      const SIMD_flt_t rb2inv = SIMD_rcpz(rbsq > szero, rbsq);
+      const SIMD_flt_t rabinv = SIMD_sqrt(ra2inv*rb2inv);
+
+      SIMD_flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
+      const SIMD_flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+      // error check
+      const SIMD_flt_t one = SIMD_set((flt_t)1.0);
+      const SIMD_flt_t mone = SIMD_set((flt_t)-1.0);
+
+      const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
+      const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
+      if (c > ptol || c < ntol)
+	if (screen)
+          error->warning(FLERR,"Dihedral problem.");
+
+      c = SIMD_set(c, c > one, one);
+      c = SIMD_set(c, c < mone, mone);
+
+      const SIMD_flt_t tcos_shift = SIMD_gather(nmask, cos_shift, type);
+      const SIMD_flt_t tsin_shift = SIMD_gather(nmask, sin_shift, type);
+      const SIMD_flt_t tk = SIMD_gather(nmask, k, type);
+      const SIMD_int m = SIMD_gather(nmask, multiplicity, type);
+
+      SIMD_flt_t p(one);
+      SIMD_flt_t ddf1(szero);
+      SIMD_flt_t df1(szero);
+      
+      const int m_max = SIMD_max(m);
+
+      for (int i = 0; i < m_max; i++) {
+	const SIMD_mask my_m = i < m;
+	ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
+	df1 = SIMD_set(df1, my_m, p*s + df1*c);
+	p = SIMD_set(p, my_m, ddf1);
+      }
+
+      SIMD_flt_t multf;
+      SIMD_cast(-m,multf);
+      p = p*tcos_shift + df1*tsin_shift;
+      df1 = df1*tcos_shift - ddf1*tsin_shift;
+      df1 = df1 * multf;
+      p = p + one;
+      
+      SIMD_mask mzero = (m == SIMD_set((int)0));
+      p = SIMD_set(p, mzero, one + tcos_shift);
+      df1 = SIMD_set(df1, mzero, szero);
+
+      const SIMD_flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+      const SIMD_flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+      const SIMD_flt_t fga = fg*ra2inv*rginv;
+      const SIMD_flt_t hgb = hg*rb2inv*rginv;
+      const SIMD_flt_t gaa = -ra2inv*rg;
+      const SIMD_flt_t gbb = rb2inv*rg;
+
+      const SIMD_flt_t dtfx = gaa*ax;
+      const SIMD_flt_t dtfy = gaa*ay;
+      const SIMD_flt_t dtfz = gaa*az;
+      const SIMD_flt_t dtgx = fga*ax - hgb*bx;
+      const SIMD_flt_t dtgy = fga*ay - hgb*by;
+      const SIMD_flt_t dtgz = fga*az - hgb*bz;
+      const SIMD_flt_t dthx = gbb*bx;
+      const SIMD_flt_t dthy = gbb*by;
+      const SIMD_flt_t dthz = gbb*bz;
+
+      const SIMD_flt_t df = -tk * df1;
+
+      const SIMD_flt_t sx2 = df*dtgx;
+      const SIMD_flt_t sy2 = df*dtgy;
+      const SIMD_flt_t sz2 = df*dtgz;
+
+      SIMD_flt_t f1x = df*dtfx;
+      SIMD_flt_t f1y = df*dtfy;
+      SIMD_flt_t f1z = df*dtfz;
+
+      SIMD_flt_t f2x = sx2 - f1x;
+      SIMD_flt_t f2y = sy2 - f1y;
+      SIMD_flt_t f2z = sz2 - f1z;
+
+      SIMD_flt_t f4x = df*dthx;
+      SIMD_flt_t f4y = df*dthy;
+      SIMD_flt_t f4z = df*dthz;
+
+      SIMD_flt_t f3x = -sx2 - f4x;
+      SIMD_flt_t f3y = -sy2 - f4y;
+      SIMD_flt_t f3z = -sz2 - f4z;
+
+      SIMD_flt_t qdeng;
+      if (EVFLAG) {
+	SIMD_flt_t ev_pre;
+	if (NEWTON_BOND) ev_pre = one;
+	else {
+	  ev_pre = szero;
+	  const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
+	  ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
+	  ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
+	  ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
+	  ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
+	}
+	SIMD_zero_masked(nmask, ev_pre);
+	if (EFLAG) {
+	  const SIMD_flt_t deng = tk * p;
+	  sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
+	  if (eatom) {
+	    qdeng = deng * SIMD_set((flt_t)0.25);
+	    SIMD_mask newton_mask;
+	    if (NEWTON_BOND) newton_mask = nmask;
+	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
+	    SIMD_flt_t ieng = qdeng;
+	    SIMD_jeng_update(newton_mask, featom, i2, ieng);
+	    ieng = qdeng;
+	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
+	    SIMD_jeng_update(newton_mask, featom, i3, ieng);
+	  }
+	}
+	if (vflag) {
+          sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
+	  sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
+	  sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
+	  sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
+	  sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
+	  sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
+	}
+      }
+
+      SIMD_mask newton_mask;
+      if (NEWTON_BOND) newton_mask = nmask;
+      if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
+      SIMD_safe_jforce(newton_mask, pforce, i2, f2x, f2y, f2z);
+      if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
+      SIMD_safe_jforce(newton_mask, pforce, i3, f3x, f3y, f3z);
+
+      // 1-4 LJ and Coulomb interactions
+      // tally energy/virial in pair, using newton_bond as newton flag
+
+      const SIMD_flt_t rsq = delx*delx + dely*dely + delz*delz;
+      const SIMD_flt_t r2inv = SIMD_rcpz(nmask, rsq);
+      const SIMD_flt_t r6inv = r2inv*r2inv*r2inv;
+
+      const SIMD_flt_t simd_qqrd2e = SIMD_set(qqrd2e);
+      SIMD_flt_t forcecoul;
+      if (implicit) forcecoul = simd_qqrd2e * q1 * q4 * r2inv;
+      else forcecoul = simd_qqrd2e * q1 * q4 * SIMD_sqrt(r2inv);
+
+      const SIMD_int ijtype = (itype * ntypes + jtype) << 2;
+      const SIMD_flt_t lj1 = SIMD_gather(nmask, plj1, ijtype);
+      const SIMD_flt_t lj2 = SIMD_gather(nmask, plj2, ijtype);
+      const SIMD_flt_t forcelj = r6inv * (lj1 * r6inv - lj2);
+      const SIMD_flt_t fpair = tweight * (forcelj + forcecoul) * r2inv;
+
+      f1x = f1x + delx * fpair;
+      f1y = f1y + dely * fpair;
+      f1z = f1z + delz * fpair;
+      f4x = f4x - delx * fpair;
+      f4y = f4y - dely * fpair;
+      f4z = f4z - delz * fpair;
+
+      if (EVFLAG) {
+	SIMD_flt_t ev_pre;
+	if (NEWTON_BOND) ev_pre = one;
+	else {
+	  ev_pre = szero;
+          const SIMD_flt_t half = SIMD_set((flt_t)0.5);
+          ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
+          ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
+	}
+	SIMD_zero_masked(nmask, ev_pre);
+
+	if (EFLAG) {
+	  const SIMD_flt_t ecoul = tweight * forcecoul;
+	  const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
+	  const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
+	  SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
+	  secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
+	  sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
+	  if (eatom) {
+ 	    const SIMD_flt_t half = SIMD_set((flt_t)0.5);
+	    evdwl = evdwl * half;
+	    evdwl = evdwl + half * ecoul + qdeng;
+
+            if (NEWTON_BOND) newton_mask = nmask;
+            if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
+            SIMD_flt_t ieng = evdwl;
+            SIMD_jeng_update(newton_mask, featom, i1, ieng);
+            ieng = evdwl;
+            if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
+            SIMD_jeng_update(newton_mask, featom, i4, ieng);
+	  }
+	}
+	if (vflag) {                                                    
+          spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
+	  spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
+	  spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
+	  spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
+	  spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
+	  spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
+	}                                                                    
+      }
+
+      if (NEWTON_BOND) newton_mask = nmask;
+      if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
+      SIMD_safe_jforce(newton_mask, pforce, i1, f1x, f1y, f1z);
+      if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
+      SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
+    } // for n
+
+    if (EVFLAG) {
+      if (EFLAG) {
+	oedihedral += SIMD_sum(sedihedral);
+	oecoul += SIMD_sum(secoul);
+	oevdwl += SIMD_sum(sevdwl);
+      }
+      if (vflag) {
+	ov0 += SIMD_sum(sv0); 
+	ov1 += SIMD_sum(sv1); 
+	ov2 += SIMD_sum(sv2); 
+	ov3 += SIMD_sum(sv3); 
+	ov4 += SIMD_sum(sv4); 
+	ov5 += SIMD_sum(sv5);
+	opv0 += SIMD_sum(spv0); 
+	opv1 += SIMD_sum(spv1); 
+	opv2 += SIMD_sum(spv2); 
+	opv3 += SIMD_sum(spv3); 
+	opv4 += SIMD_sum(spv4); 
+	opv5 += SIMD_sum(spv5);
+      }
+    }
+  } // omp parallel
+
+  if (EVFLAG) {
+    if (EFLAG) {
+      energy += oedihedral;
+      force->pair->eng_vdwl += oevdwl;
+      force->pair->eng_coul += oecoul;
+    }
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+      force->pair->virial[0] += opv0;
+      force->pair->virial[1] += opv1;
+      force->pair->virial[2] += opv2;
+      force->pair->virial[3] += opv3;
+      force->pair->virial[4] += opv4;
+      force->pair->virial[5] += opv5;
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+#endif
+
 /* ---------------------------------------------------------------------- */

 void DihedralCharmmIntel::init_style()
--- a/src/USER-INTEL/intel_intrinsics.h
+++ b/src/USER-INTEL/intel_intrinsics.h
@ -259,7 +259,12 @@ struct vector_ops<double, KNC> {
      return _mm512_invsqrt_pd(a);
    }
    static fvec sincos(fvec *cos, const fvec &a) {
+      #if __INTEL_COMPILER+0 < 1500
+      *reinterpret_cast<__m512d *>(cos) = _mm512_cos_pd(a);
+      return _mm512_sin_pd(a);
+      #else
      return _mm512_sincos_pd(reinterpret_cast<__m512d *>(cos), a);
+      #endif
    }
    static fscal reduce_add(const fvec &a) {
      return _mm512_reduce_add_pd(a);
@ -392,7 +397,12 @@ struct vector_ops<float, KNC> {
      return _mm512_invsqrt_ps(a);
    }
    static fvec sincos(fvec *cos, const fvec &a) {
+      #if __INTEL_COMPILER+0 < 1500
+      *reinterpret_cast<__m512 *>(cos) = _mm512_cos_ps(a);
+      return _mm512_sin_ps(a);
+      #else
      return _mm512_sincos_ps(reinterpret_cast<__m512 *>(cos), a);
+      #endif
    }
    static fscal reduce_add(const fvec &a) {
      return _mm512_reduce_add_ps(a);
--- a/src/USER-INTEL/intel_simd.h
+++ b/src/USER-INTEL/intel_simd.h
@ -37,10 +37,28 @@ authors for more details.

 namespace ip_simd {

-  typedef __m512i SIMD_int;
  typedef __mmask16 SIMD_mask;
-  typedef __m512 SIMD_float;
-  typedef __m512d SIMD_double;
+
+  struct SIMD_int {
+    __m512i v;
+    SIMD_int() {}
+    SIMD_int(const __m512i in) : v(in) {} 
+    operator __m512i() const { return v;}
+  };
+
+  struct SIMD_float {
+    __m512 v;
+    SIMD_float() {}
+    SIMD_float(const __m512 in) : v(in) {} 
+    operator __m512() const { return v;}
+  };
+
+  struct SIMD_double {
+    __m512d v;
+    SIMD_double() {}
+    SIMD_double(const __m512d in) : v(in) {} 
+    operator __m512d() const { return v;}
+  };

  template<class flt_t> 
  class SIMD_type {
@ -276,6 +294,18 @@ namespace ip_simd {
    return _mm512_mask_sub_pd(s,m,one,two);
  }

+  inline SIMD_int operator-(const SIMD_int &one) {
+    return _mm512_sub_epi32(SIMD_set((int)0),one);
+  }
+  
+  inline SIMD_float operator-(const SIMD_float &one) {
+    return _mm512_sub_ps(SIMD_set((float)0),one);
+  }
+  
+  inline SIMD_double operator-(const SIMD_double &one) {
+    return _mm512_sub_pd(SIMD_set((double)0),one);
+  }
+
  inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_sub_epi32(one,two);
  }
@ -694,17 +724,17 @@ namespace ip_simd {
  
  // -------- I/O operations

-  inline void SIMD_print(const SIMD_int &vec) {
+  inline void SIMD_print(const __m512i &vec) {
    for (int i = 0; i < 16; i++) 
      printf("%d ",(*((int*)&(vec) + (i))));
  }

-  inline void SIMD_print(const SIMD_float &vec) {
+  inline void SIMD_print(const __m512 &vec) {
    for (int i = 0; i < 16; i++) 
      printf("%f ",(*((float*)&(vec) + (i))));
  }

-  inline void SIMD_print(const SIMD_double &vec) {
+  inline void SIMD_print(const __m512d &vec) {
    for (int i = 0; i < 8; i++) 
      printf("%f ",(*((double*)&(vec) + (i))));
  }
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@ -1037,8 +1037,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
        #if defined(LMP_SIMD_COMPILER)
 	#ifdef _LMP_INTEL_OFFLOAD
 	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#if __INTEL_COMPILER+0 > 1499
 	#pragma vector aligned
        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#endif
 	#else
 	#pragma vector aligned
        #pragma simd
@ -1645,8 +1647,10 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
        #if defined(LMP_SIMD_COMPILER)
 	#ifdef _LMP_INTEL_OFFLOAD
 	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#if __INTEL_COMPILER+0 > 1499
 	#pragma vector aligned
        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#endif
 	#else
 	#pragma vector aligned
        #pragma simd
@ -2241,8 +2245,10 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
        #if defined(LMP_SIMD_COMPILER)
 	#ifdef _LMP_INTEL_OFFLOAD
 	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#if __INTEL_COMPILER+0 > 1499
 	#pragma vector aligned
        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#endif
 	#else
 	#pragma vector aligned
        #pragma simd
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@ -323,7 +323,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 		  lj4 = lj34i[jtype].lj4;
 		  offset = ljc12oi[jtype].offset;
 		}
-                evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
+                evdwl = r6inv * (lj3 * r6inv - lj4);
+                #ifdef INTEL_VMASK
+		evdwl -= offset;
+		#else
+		if (rsq < cutsq) evdwl -= offset;
+		#endif
                if (!ONETYPE) evdwl *= factor_lj;
                sevdwl += ev_pre*evdwl;
                if (eatom) {