USER-INTEL add-ons from Mike

2017-08-15 17:12:07 -06:00 · 2017-08-15 17:12:07 -06:00 · 1d4d2155a2
parent 0b3f1b8a15
commit 1d4d2155a2
35 changed files with 9675 additions and 49 deletions
--- a/doc/src/JPG/user_intel.png
+++ b/doc/src/JPG/user_intel.png
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
-Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
-charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
+Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
+buck, eam, eam/alloy, eam/fs, gayberne, charmm/coul/charmm, 
+charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
+sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule

--- a/doc/src/pair_airebo.txt
+++ b/doc/src/pair_airebo.txt
@ -7,10 +7,13 @@
 :line

 pair_style airebo command :h3
+pair_style airebo/intel command :h3
 pair_style airebo/omp command :h3
 pair_style airebo/morse command :h3
+pair_style airebo/morse/intel command :h3
 pair_style airebo/morse/omp command :h3
 pair_style rebo command :h3
+pair_style rebo/intel command :h3
 pair_style rebo/omp command :h3

 [Syntax:]
--- a/doc/src/pair_charmm.txt
+++ b/doc/src/pair_charmm.txt
@ -7,6 +7,7 @@
 :line

 pair_style lj/charmm/coul/charmm command :h3
+pair_style lj/charmm/coul/charmm/intel command :h3
 pair_style lj/charmm/coul/charmm/omp command :h3
 pair_style lj/charmm/coul/charmm/implicit command :h3
 pair_style lj/charmm/coul/charmm/implicit/omp command :h3
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@ -14,6 +14,7 @@ pair_style eam/omp command :h3
 pair_style eam/opt command :h3
 pair_style eam/alloy command :h3
 pair_style eam/alloy/gpu command :h3
+pair_style eam/alloy/intel command :h3
 pair_style eam/alloy/kk command :h3
 pair_style eam/alloy/omp command :h3
 pair_style eam/alloy/opt command :h3
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
 pair_style eam/cd/omp command :h3
 pair_style eam/fs command :h3
 pair_style eam/fs/gpu command :h3
+pair_style eam/fs/intel command :h3
 pair_style eam/fs/kk command :h3
 pair_style eam/fs/omp command :h3
 pair_style eam/fs/opt command :h3
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@ -14,7 +14,7 @@ SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size

--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@ -7,7 +7,7 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler

 CC =		mpicxx -cxx=icc
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@ -8,7 +8,7 @@ SHELL = /bin/sh

 export OMPI_CXX = icc
 CC =		mpicxx
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@ -46,6 +46,7 @@ action npair_intel.h
 action npair_intel.cpp
 action intel_simd.h pair_sw_intel.cpp
 action intel_intrinsics.h pair_tersoff_intel.cpp
+action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 action verlet_lrt_intel.h pppm.cpp
 action verlet_lrt_intel.cpp pppm.cpp

--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@ -4,9 +4,9 @@
                     --------------------------------
                     
             W. Michael Brown (Intel) michael.w.brown at intel.com
+                  Markus Hohnerbach (RWTH Aachen University)
                   William McDoniel (RWTH Aachen University)
                   Rodrigo Canales (RWTH Aachen University)
-                  Markus H<>hnerbach (RWTH Aachen University)
                           Stan Moore (Sandia)
 		   Ahmed E. Ismail (RWTH Aachen University)
                   Paolo Bientinesi (RWTH Aachen University)
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@ -8,6 +8,7 @@
 # in.intel.sw -	        Silicon benchmark with Stillinger-Weber
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
+# in.intel.airebo -     Polyethylene benchmark with AIREBO
 #
 #############################################################################

@ -24,6 +25,7 @@
 # in.intel.sw -	           132.4               161.9
 # in.intel.tersoff -        83.3               101.1
 # in.intel.water -          53.4                90.3
+# in.intel.airebo -          7.3                11.8
 #
 #############################################################################

--- a/src/USER-INTEL/TEST/in.intel.airebo
+++ b/src/USER-INTEL/TEST/in.intel.airebo
@ -0,0 +1,47 @@
+# AIREBO polyethylene benchmark
+
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 550	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+variable	x index 4	# Replication multiplier in x
+variable	y index 2	# Replication multiplier in y
+variable	z index 2	# Replication multiplier in z
+
+variable	xx equal 17*$x
+variable	yy equal 16*$y
+variable	zz equal 2*$z
+variable	rr equal floor($t*$m)
+variable        root getenv LMP_ROOT	# Base path for examples/ and potentials/
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+units		    metal
+atom_style	    atomic
+
+read_data	    ${root}/examples/airebo/data.airebo
+
+replicate	    ${xx} ${yy} ${zz}
+
+neighbor	    0.5 bin
+neigh_modify	    delay 5 every 1
+
+pair_style	    airebo 3.0 1 1
+pair_coeff	    * * ${root}/potentials/CH.airebo C H
+
+velocity	    all create 300.0 761341
+
+fix		    1 all nve
+timestep	    0.0005
+
+thermo		    50
+
+if "$p > 0"	then "run_style verlet/power"
+
+if "$w > 0"	then "run $w"
+run		${rr}
--- a/src/USER-INTEL/TEST/in.intel.eam
+++ b/src/USER-INTEL/TEST/in.intel.eam
@ -5,7 +5,6 @@ variable	w index 10      # Warmup Timesteps
 variable	t index 3100    # Main Run Timesteps
 variable	m index 1       # Main Run Timestep Multiplier
 variable	n index 0       # Use NUMA Mapping for Multi-Node
-variable	b index 3       # Neighbor binsize
 variable	p index 0       # Use Power Measurement

 variable	x index 4
--- a/src/USER-INTEL/TEST/in.intel.rhodo
+++ b/src/USER-INTEL/TEST/in.intel.rhodo
@ -5,7 +5,6 @@ variable	w index 10	# Warmup Timesteps
 variable	t index 520	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
-variable        b index 3       # Neighbor binsize
 variable	p index 0	# Use Power Measurement
 variable	c index 0	# 1 to use collectives for PPPM
 variable        d index 1       # 1 to use 'diff ad' for PPPM
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -30,6 +30,9 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
  _off_map_listlocal = 0;
  _ccachex = 0;
  _ncache_alloc = 0;
+  _ncachetag = 0;
+  _cutneighsq = 0;
+  _cutneighghostsq = 0;
  #ifdef _LMP_INTEL_OFFLOAD
  _separate_buffers = 0;
  _off_f = 0;
@ -447,12 +450,17 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
    flt_t *ncachez = _ncachez;
    int *ncachej = _ncachej;
    int *ncachejtype = _ncachejtype;
+    int *ncachetag = _ncachetag;

    #ifdef _LMP_INTEL_OFFLOAD
    if (_off_ncache) {
      #pragma offload_transfer target(mic:_cop) \
        nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
        nocopy(ncachejtype:alloc_if(0) free_if(1))
+      if (ncachetag) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ncachetag:alloc_if(0) free_if(1))
+      }
    }
    _off_ncache = 0;
    #endif
@ -462,8 +470,10 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
    lmp->memory->destroy(ncachez);
    lmp->memory->destroy(ncachej);
    lmp->memory->destroy(ncachejtype);
-
+    if (ncachetag)
+      lmp->memory->destroy(ncachetag);
    _ncache_alloc = 0;
+    _ncachetag = 0;
  }
 }

@ -480,7 +490,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
  const int vsize = _ncache_stride * nt;

  if (_ncache_alloc) {
-    if (vsize > _ncache_alloc)
+    if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0))
      free_ncache();
    #ifdef _LMP_INTEL_OFFLOAD
    else if (off_flag && _off_ncache == 0)
@ -495,6 +505,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
  lmp->memory->create(_ncachez, vsize, "_ncachez");
  lmp->memory->create(_ncachej, vsize, "_ncachej");
  lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
+  if (need_tag())
+    lmp->memory->create(_ncachetag, vsize, "_ncachetag");

  _ncache_alloc = vsize;

@ -513,6 +525,14 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
        nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
        nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
    }
+    int tsize = vsize;
+    if (!need_tag()) {
+      tsize = 16;
+      lmp->memory->create(_ncachetag, tsize, "_ncachetag");
+    }
+    int *ncachetag = _ncachetag;
+    #pragma offload_transfer target(mic:_cop)			\
+      nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0))
    _off_ncache = 1;
  }
  #endif
@ -548,7 +568,8 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
 /* ---------------------------------------------------------------------- */

 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes, 
+					    const int use_ghost_cut)
 {
  if (ntypes != _ntypes) {
    if (_ntypes > 0) {
@ -558,16 +579,34 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
        #pragma offload_transfer target(mic:_cop) \
          nocopy(cutneighsqo:alloc_if(0) free_if(1))
      }
+      flt_t * cutneighghostsqo = 0;  // was tested before being assigned
+      if (_cutneighghostsq) cutneighghostsqo = _cutneighghostsq[0];
+      if (_off_threads > 0 && cutneighghostsqo != 0) {  // same test as alloc
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
+      }
      #endif
      lmp->memory->destroy(_cutneighsq);
+      if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq);
    }
    if (ntypes > 0) {
      lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+      if (use_ghost_cut)
+	lmp->memory->create(_cutneighghostsq, ntypes, ntypes, 
+			    "_cutneighghostsq");
      #ifdef _LMP_INTEL_OFFLOAD
      flt_t * cutneighsqo = _cutneighsq[0];
+      const int ntypes2 = ntypes * ntypes;
      if (_off_threads > 0 && cutneighsqo != NULL) {
        #pragma offload_transfer target(mic:_cop) \
-          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+          nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0))
+      }
+      if (use_ghost_cut) {
+        flt_t * cutneighghostsqo = _cutneighghostsq[0];
+        if (_off_threads > 0 && cutneighghostsqo != NULL) {
+          #pragma offload_transfer target(mic:_cop) \
+            nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0))
+        }
      }
      #endif
    }
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -109,12 +109,14 @@ class IntelBuffers {

  void free_ncache();
  void grow_ncache(const int off_flag, const int nthreads);
+  void grow_ncachetag(const int off_flag, const int nthreads);
  inline int ncache_stride() { return _ncache_stride; }
  inline flt_t * get_ncachex() { return _ncachex; }
  inline flt_t * get_ncachey() { return _ncachey; }
  inline flt_t * get_ncachez() { return _ncachez; }
  inline int * get_ncachej() { return _ncachej; }
  inline int * get_ncachejtype() { return _ncachejtype; }
+  inline int * get_ncachetag() { return _ncachetag; }

  inline int get_max_nbors() {
    int mn = lmp->neighbor->oneatom * sizeof(int) /
@ -131,7 +133,7 @@ class IntelBuffers {
      _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
  }

-  void set_ntypes(const int ntypes);
+  void set_ntypes(const int ntypes, const int use_ghost_cut = 0);

  inline int * firstneigh(const NeighList *list) { return _list_alloc; }
  inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
@ -162,6 +164,7 @@ class IntelBuffers {
  inline void zero_ev()
    { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
  inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; }
  inline int get_off_threads() { return _off_threads; }
  #ifdef _LMP_INTEL_OFFLOAD
  inline void set_off_params(const int n, const int cop,
@ -274,13 +277,10 @@ class IntelBuffers {
             used_ghost * sizeof(flt_t));
    }
  }
+  #endif

  inline int need_tag() { return _need_tag; }
  inline void need_tag(const int nt) { _need_tag = nt; }
-  #else
-  inline int need_tag() { return 0; }
-  inline void need_tag(const int nt) { }
-  #endif

  double memory_usage(const int nthreads);

@ -298,7 +298,7 @@ class IntelBuffers {
  int _list_alloc_atoms;
  int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;

-  flt_t **_cutneighsq;
+  flt_t **_cutneighsq, **_cutneighghostsq;
  int _ntypes;

  int _ccache_stride;
@ -307,7 +307,10 @@ class IntelBuffers {

  int _ncache_stride, _ncache_alloc;
  flt_t *_ncachex, *_ncachey, *_ncachez;
-  int *_ncachej, *_ncachejtype;
+  int *_ncachej, *_ncachejtype, *_ncachetag;
+
+  int _need_tag, _host_nmax;
+
  #ifdef LMP_USE_AVXCD
  int _ccache_stride3;
  acc_t * _ccachef;
@ -324,7 +327,6 @@ class IntelBuffers {
  int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
  int *_off_map_numneigh;
  bool _off_list_alloc;
-  int _need_tag, _host_nmax;
  #endif

  int _buf_size, _buf_local_size;
--- a/src/USER-INTEL/intel_intrinsics_airebo.h
+++ b/src/USER-INTEL/intel_intrinsics_airebo.h
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@ -211,6 +211,8 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
    for (i = nall-1; i >= nlocal; i--) {
      if (mask[i] & bitmask) {
        ibin = coord2bin(atom->x[i]);
+	// Only necessary to store when neighboring ghost
+	atombin[i] = ibin;
        bins[i] = binhead[ibin];
        binhead[ibin] = i;
      }
@ -222,14 +224,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
      binhead[ibin] = i;
    }
  } else {
-    for (i = nall-1; i >= nlocal; i--) {
+    for (i = nall-1; i >= 0; i--) {
      ibin = coord2bin(atom->x[i]);
-      bins[i] = binhead[ibin];
-      binhead[ibin] = i;
-    }
-    for (i = nlocal-1; i >= 0; i--) {
-      ibin = coord2bin(atom->x[i]);
-      atombin[i]=ibin;
+      // Only necessary to store for ghost when neighboring ghost
+      atombin[i] = ibin;
      bins[i] = binhead[ibin];
      binhead[ibin] = i;
    }
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@ -0,0 +1,593 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_full_bin_ghost_intel.h"
+#include "neighbor.h"
+#include "nstencil.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "molecule.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp)
+  : NPairIntel(lmp)
+{
+  // nothing beyond the NPairIntel base-class initialization
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   include neighbors of ghost atoms, but no "special neighbors" for ghosts
+   every neighbor pair appears in list of both atoms i and j
+   rejects the 'ghost no' option and exclusion lists in offload builds,
+   and stencils larger than INTEL_MAX_STENCIL_CHECK in all builds
+   then calls fbi() with the buffers matching the precision mode of the
+   fix and stops the TIME_HOST_NEIGHBOR timer
+------------------------------------------------------------------------- */
+
+void NPairFullBinGhostIntel::build(NeighList *list)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_fix->offload_noghost())
+    error->all(FLERR,
+      "The 'ghost no' option cannot be used with this USER-INTEL pair style.");
+  #endif
+
+  if (nstencil > INTEL_MAX_STENCIL_CHECK)
+    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+    fbi(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    fbi(list, _fix->get_double_buffers());
+  else  // any other precision mode falls back to the single buffers
+    fbi(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR);
+}
+
+/* ----------------------------------------------------------------------
+   precision-typed driver called from build()
+   records inum (nlocal) and gnum (nghost) on the list and grows the list
+   buffers for all nlocal+nghost atoms
+   then calls the worker twice: offload=1 for [0,off_end) and offload=0
+   for [host_start,nlocal); the worker returns at once for an empty range
+   the need_ic template argument is 1 only if dminimum_image_check sets
+   need_ic, which is only evaluated for molecular systems
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+void NPairFullBinGhostIntel::fbi(NeighList * list, 
+				 IntelBuffers<flt_t,acc_t> * buffers) 
+{
+  const int nlocal = atom->nlocal;
+  const int nall = atom->nlocal + atom->nghost;
+  list->inum = atom->nlocal;
+  list->gnum = atom->nghost;
+
+  int host_start = _fix->host_start_neighbor();
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost();  // NOTE(review): not referenced below
+  #endif
+
+  // only uses offload_end_neighbor to check whether we are doing offloading
+  // at all, no need to correct this later
+  buffers->grow_list(list, nall, comm->nthreads, off_end,
+		     _fix->nbor_pack_width());
+
+  int need_ic = 0;
+  if (atom->molecular)
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+			 neighbor->cutneighmax);
+
+  if (need_ic) {
+    fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end);  // offload pass
+    fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);  // host pass
+  } else {
+    fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);  // offload pass
+    fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);  // host pass
+  }
+}
+
+/* ----------------------------------------------------------------------
+   worker that fills the full binned list, including lists for ghosts
+   offload != 0 selects the coprocessor thread count, overflow flag and
+     timer (only in _LMP_INTEL_OFFLOAD builds); otherwise the host thread
+     count and overflow flag are used
+   pstart/pend are only tested for an empty range; the range handed to
+     IP_PRE_omp_range_id is built from all nlocal+nghost atoms
+   local atoms gather candidates through the merged stencil ranges and are
+     tested against cutneighsq; ghost atoms walk the stencil bin by bin,
+     skip bins outside the bin grid, and are tested against cutneighghostsq
+   accepted neighbors of atom i are collected in two groups chosen by the
+     tag-parity (flist) test and then packed back to back; the size of the
+     first group is stored in atombin[i]
+   need_ic != 0 runs ominimum_image_check for local atoms
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int need_ic>
+void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, 
+				 IntelBuffers<flt_t,acc_t> * buffers,
+				 const int pstart, const int pend) {
+  if (pend-pstart == 0) return;  // empty range: nothing to build
+
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;  // NOTE(review): not referenced again in this function
+  int nall_t = nall;
+  const int aend = nall;  // lists are built for local and ghost atoms
+
+  const int pack_width = _fix->nbor_pack_width();
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL;
+  tagint *s = NULL;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;  // not molecular: point at holder member
+    ns = &buffers->_nspecial_holder;
+    special_size = 0;
+  }
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const tagint * _noalias const tag = atom->tag;
+
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = this->nstencil;
+  const int * _noalias const stencil = this->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
+  const flt_t * _noalias const cutneighghostsq = 
+    buffers->get_cutneighghostsq()[0];
+  const int ntypes = atom->ntypes + 1;  // row length used for ioffset below
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  tagint * const molecule = atom->molecule;
+  #endif
+
+  int *molindex = atom->molindex;
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+  int moltemplate;
+  if (molecular == 2) moltemplate = 1;
+  else moltemplate = 0;
+  if (moltemplate) 
+    error->all(FLERR, 
+	       "Can't use moltemplate with npair style full/bin/ghost/intel.");
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    timer_compute = _fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = _fix->get_off_overflow_flag();
+    _fix->stop_watch(TIME_HOST_NEIGHBOR);
+    _fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else
+  #endif
+  {
+    tnum = comm->nthreads;
+    overflow = _fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+  int * _noalias const atombin = buffers->get_atombin();
+  const int * _noalias const binpacked = buffers->get_binpacked();
+
+  const int xperiodic = domain->xperiodic;
+  const int yperiodic = domain->yperiodic;
+  const int zperiodic = domain->zperiodic;
+  const flt_t xprd_half = domain->xprd_half;
+  const flt_t yprd_half = domain->yprd_half;
+  const flt_t zprd_half = domain->zprd_half;
+
+  flt_t * _noalias const ncachex = buffers->get_ncachex();
+  flt_t * _noalias const ncachey = buffers->get_ncachey();
+  flt_t * _noalias const ncachez = buffers->get_ncachez();
+  int * _noalias const ncachej = buffers->get_ncachej();
+  int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
+  const int ncache_stride = buffers->ncache_stride();
+
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * const stencilxyz = &this->stencilxyz[0][0];
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const bins = this->bins;
+  const int cop = _fix->coprocessor_number();
+  const int separate_buffers = _fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(atombin:length(aend) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
+    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
+    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
+    in(stencilxyz:length(3*nstencil)) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(tag)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = 0;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    int nstencilp = 0;  // number of merged stencil ranges
+    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
+    for (int k = 0; k < nstencil; k++) {
+      binstart[nstencilp] = stencil[k];
+      int end = stencil[k] + 1;
+      for (int kk = k + 1; kk < nstencil; kk++) {
+        if (stencil[kk-1]+1 == stencil[kk]) {  // consecutive offsets: merge
+          end++;
+          k++;
+        } else break;
+      }
+      binend[nstencilp] = end;
+      nstencilp++;
+    }
+
+    const int mbinyx = mbiny * mbinx;
+
+    #if defined(_OPENMP)
+    #pragma omp parallel
+    #endif
+    {
+      const int num = aend;
+      int tid, ifrom, ito;
+
+      const double balance_factor = 2.0;  // weight of a local atom vs a ghost
+      const double ibalance_factor = 1.0 / balance_factor;
+      const int gnum = num - nlocal;
+      const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
+      const int snum = wlocal + gnum;  // weighted item count that is split
+      IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
+      if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom);
+      else ifrom -= wlocal - nlocal;  // map weighted index back to atom index
+      if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
+      else ito -= wlocal - nlocal;
+
+      int e_ito = ito;
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
+
+      int which;
+
+      int pack_offset = maxnbors;
+      int ct = (ifrom + tid * 2) * maxnbors;  // this thread's write offset
+      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;
+
+      const int toffs = tid * ncache_stride;  // per-thread slice of the caches
+      flt_t * _noalias const tx = ncachex + toffs;
+      flt_t * _noalias const ty = ncachey + toffs;
+      flt_t * _noalias const tz = ncachez + toffs;
+      int * _noalias const tj = ncachej + toffs;
+      int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;
+
+      // loop over all atoms in other bins in stencil, store every pair
+      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
+      for (int i = ifrom; i < ito; i++) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+        const tagint itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        const int ibin = atombin[i];
+        if (ibin != oldbin) {  // candidate cache is reused while bin is same
+          oldbin = ibin;
+          ncount = 0;
+	  if (i < nlocal) {  // local atom: use the merged stencil ranges
+	    for (int k = 0; k < nstencilp; k++) {
+	      const int bstart = binhead[ibin + binstart[k]];
+	      const int bend = binhead[ibin + binend[k]];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } else {  // ghost atom: visit stencil bins one at a time
+	    const int zbin = ibin / mbinyx;
+	    const int zrem = ibin % mbinyx;
+	    const int ybin = zrem / mbinx;
+	    const int xbin = zrem % mbinx;
+	    for (int k = 0; k < nstencil; k++) {
+	      const int xbin2 = xbin + stencilxyz[3 * k + 0];
+	      const int ybin2 = ybin + stencilxyz[3 * k + 1];
+	      const int zbin2 = zbin + stencilxyz[3 * k + 2];
+	      if (xbin2 < 0 || xbin2 >= mbinx ||
+                  ybin2 < 0 || ybin2 >= mbiny ||
+                  zbin2 < 0 || zbin2 >= mbinz) continue;  // outside bin grid
+
+	      const int bstart = binhead[ibin + stencil[k]];
+	      const int bend = binhead[ibin + stencil[k] + 1];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } // if i < nlocal
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int u = 0; u < ncount; u++) {
+            const int j = tj[u];
+            tx[u] = x[j].x;
+            ty[u] = x[j].y;
+            tz[u] = x[j].z;
+            tjtype[u] = x[j].w;
+	    ttag[u] = tag[j];
+          }
+	} // if ibin != oldbin
+
+        // ---------------------- Loop over other bins
+
+        int n = maxnbors;  // first group is staged from offset maxnbors
+        int n2 = n * 2;  // second (flist) group is staged from 2*maxnbors
+	int *neighptr2 = neighptr;
+	const flt_t * _noalias cutsq;
+	if (i < nlocal) cutsq = cutneighsq;
+	else cutsq = cutneighghostsq;  // ghost atoms use the ghost cutoffs
+
+	const int icp = i;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int u = 0; u < ncount; u++) {
+          int addme = 1;
+          int j = tj[u];
+
+	  if (i == j) addme = 0;  // never list an atom as its own neighbor
+
+          // Cutoff Check
+          const flt_t delx = xtmp - tx[u];
+          const flt_t dely = ytmp - ty[u];
+          const flt_t delz = ztmp - tz[u];
+          const int jtype = tjtype[u];
+	  const int jtag = ttag[u];
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq[ioffset + jtype]) addme = 0;
+
+          if (need_ic && icp < nlocal) {
+            int no_special;
+	    ominimum_image_check(no_special, delx, dely, delz);
+            if (no_special)
+              j = -j - 1;  // negative index marks the pair for the pass below
+          }
+
+	  int flist = 0;
+	  if (itag > jtag) {
+	    if (((itag+jtag) & 1) == 0) flist = 1;
+	  } else if (itag < jtag) {
+	    if (((itag+jtag) & 1) == 1) flist = 1;
+	  } else {  // equal tags: order by z, then y, then x
+	    if (tz[u] < ztmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
+	      flist = 1;
+	  }
+	  if (addme) {
+	    if (flist)
+	      neighptr2[n2++] = j;
+	    else
+	      neighptr[n++] = j;
+	  }
+        } // for u
+
+        #ifndef _LMP_INTEL_OFFLOAD
+        if (exclude) {
+          int alln = n;
+          n = maxnbors;
+          for (int u = pack_offset; u < alln; u++) {
+            const int j = neighptr[u];
+            int pj = j;
+            if (need_ic)
+              if (pj < 0) pj = -j - 1;
+            const int jtype = x[pj].w;
+            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+            neighptr[n++] = j;
+          }
+	  alln = n2;
+	  n2 = maxnbors * 2;
+	  for (int u = n2; u < alln; u++) {
+	    const int j = neighptr[u];
+	    int pj = j;
+	    if (need_ic)
+	      if (pj < 0) pj = -j - 1;
+	    const int jtype = x[pj].w;
+	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+	    neighptr[n2++] = j;
+	  }
+        }
+        #endif
+        int ns = n - maxnbors;  // size of the first group
+	int alln = n;
+	atombin[i] = ns;  // atombin[i] now holds the first-group count
+	n = 0;
+	for (int u = maxnbors; u < alln; u++)
+          neighptr[n++] = neighptr[u];
+	ns += n2 - maxnbors * 2;  // add the size of the second group
+	for (int u = maxnbors * 2; u < n2; u++)
+          neighptr[n++] = neighptr[u];
+	if (ns > maxnbors) *overflow = 1;
+
+        ilist[i] = i;
+        cnumneigh[i] = ct;
+        numneigh[i] = ns;
+
+	ct += ns;
+	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	const int edge = ct & (alignb - 1);
+	if (edge) ct += alignb - edge;  // round ct up to a multiple of alignb
+	neighptr = firstneigh + ct;
+	if (ct + obound > list_size) {
+	  if (i < ito - 1) {
+	    *overflow = 1;
+	    ct = (ifrom + tid * 2) * maxnbors;  // restart at the thread's base
+	  }
+	}
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          #if __INTEL_COMPILER+0 > 1499
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) {
+            int j = jlist[jj];
+            if (need_ic && j < 0) j = -j - 1;
+          }
+        }
+
+	overflow[LMP_LOCAL_MIN] = 0;
+	overflow[LMP_LOCAL_MAX] = nlocal - 1;
+	overflow[LMP_GHOST_MIN] = nlocal;
+	overflow[LMP_GHOST_MAX] = e_nall - 1;
+
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      } // if separate_buffers
+      #endif
+
+      if (molecular) {
+	int ito_m = ito;
+	if (ito >= nlocal) ito_m = nlocal;  // special bits only for local atoms
+        for (int i = ifrom; i < ito_m; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            if (need_ic && j < 0) {
+              which = 0;
+              jlist[jj] = -j - 1;  // restore the index marked in the pair loop
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j >= nlocal) {
+              if (j == e_nall)
+                jlist[jj] = nall_offset;
+              else if (which)
+                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+              else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+            if (which) jlist[jj] = j ^ (which << SBBITS);
+          }
+        } // for i
+      } // if molecular
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          int jj = 0;
+          #pragma vector aligned
+          #pragma simd
+          for (jj = 0; jj < jnum; jj++) {
+            if (jlist[jj] >= nlocal) {
+              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+              else jlist[jj] -= ghost_offset;
+            }
+          }
+        }
+      }
+      #endif
+    } // end omp
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    _fix->start_watch(TIME_HOST_NEIGHBOR);
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+  } else {
+    for (int i = 0; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    if (separate_buffers) {
+      _fix->start_watch(TIME_PACK);
+      _fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(_fix->host_min_local(),
+                                    _fix->host_used_local(),
+                                    _fix->host_min_ghost(),
+                                    _fix->host_used_ghost());
+      _fix->stop_watch(TIME_PACK);
+    }
+  }
+  #else
+  #pragma vector aligned
+  #pragma simd
+  for (int i = 0; i < aend; i++)
+    list->firstneigh[i] = firstneigh + cnumneigh[i];  // per-atom list start
+  #endif
+}
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.h
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h
@ -0,0 +1,55 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(full/bin/ghost/intel,
+           NPairFullBinGhostIntel,
+           NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | 
+           NP_ORTHO | NP_TRI | NP_INTEL)
+
+#else
+
+#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+
+#include "npair_intel.h"
+
+namespace LAMMPS_NS {
+
+class NPairFullBinGhostIntel : public NPairIntel {
+ public:
+  NPairFullBinGhostIntel(class LAMMPS *);
+  ~NPairFullBinGhostIntel() {}
+
+  // public entry point: build the neighbor list passed in
+  void build(class NeighList *);
+
+ private:
+  // overload templated only on the two precision types of the buffers
+  template <class flt_t, class acc_t>
+  void fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers);
+
+  // overload with the extra compile-time switch need_ic
+  // NOTE(review): astart/aend look like an atom index range and offload
+  // like a host/coprocessor selector - confirm against the .cpp
+  template <class flt_t, class acc_t, int need_ic>
+  void fbi(const int offload, NeighList *list,
+           IntelBuffers<flt_t,acc_t> *buffers,
+           const int astart, const int aend);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@ -143,6 +143,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
  flt_t * _noalias const ncachez = buffers->get_ncachez();
  int * _noalias const ncachej = buffers->get_ncachej();
  int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
  const int ncache_stride = buffers->ncache_stride();

  #ifdef _LMP_INTEL_OFFLOAD
@ -165,7 +166,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
    in(atombin:length(aend) alloc_if(0) free_if(0)) \
    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
-    in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
    in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
    in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
@ -222,7 +223,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
      ito += astart;
      int e_ito = ito;
      if (THREE && ito == num) {
-        int imod = ito % pack_width;
+        int imod = ito & (pack_width - 1);
        if (imod) e_ito += pack_width - imod;
      }
      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
@ -241,6 +242,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
      flt_t * _noalias const tz = ncachez + toffs;
      int * _noalias const tj = ncachej + toffs;
      int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;

      flt_t * _noalias itx;
      flt_t * _noalias ity;
@ -287,13 +289,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
            ty[u] = x[j].y;
            tz[u] = x[j].z;
            tjtype[u] = x[j].w;
+	    if (THREE) ttag[u] = tag[j];
          }

          if (FULL == 0 || TRI == 1) {
            icount = 0;
            istart = ncount;
            const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-            int nedge = istart % alignb;
+            int nedge = istart & (alignb - 1);
            if (nedge) istart + (alignb - nedge);
            itx = tx + istart;
            ity = ty + istart;
@ -343,7 +346,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,

            // i bin (half) check and offload ghost check
            if (j < nlocal) {
-              const int ijmod = (i + j) % 2;
+              const int ijmod = (i + j) & 1;
              if (i > j) {
                if (ijmod == 0) addme = 0;
              } else if (i < j) {
@ -424,8 +427,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          }
          #endif

-          int pj;
-          if (THREE) pj = j;
          if (need_ic) {
            int no_special;
            ominimum_image_check(no_special, delx, dely, delz);
@ -434,12 +435,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          }

          if (THREE) {
-            const int jtag = tag[pj];
+            const int jtag = ttag[u];
            int flist = 0;
            if (itag > jtag) {
-              if ((itag+jtag) % 2 == 0) flist = 1;
+	      if (((itag+jtag) & 1) == 0) flist = 1;
            } else if (itag < jtag) {
-              if ((itag+jtag) % 2 == 1) flist = 1;
+	      if (((itag+jtag) & 1) == 1) flist = 1;
            } else {
              if (tz[u] < ztmp) flist = 1;
              else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
@ -512,7 +513,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          cnumneigh[i] += lane;
          numneigh[i] = ns;
        } else {
-          int edge = (n % pad_width);
+          int edge = n & (pad_width - 1);
          if (edge) {
            const int pad_end = n + (pad_width - edge);
            #if defined(LMP_SIMD_COMPILER)
@ -532,7 +533,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          if (lane == pack_width) {
            ct += max_chunk * pack_width;
            const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-            const int edge = (ct % alignb);
+            const int edge = ct & (alignb - 1);
            if (edge) ct += alignb - edge;
            neighptr = firstneigh + ct;
            max_chunk = 0;
@ -548,7 +549,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
        } else {
          ct += n;
          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = (ct % alignb);
+          const int edge = ct & (alignb - 1);
          if (edge) ct += alignb - edge;
          neighptr = firstneigh + ct;
          if (ct + obound > list_size) {
--- a/src/USER-INTEL/pair_airebo_intel.cpp
+++ b/src/USER-INTEL/pair_airebo_intel.cpp
--- a/src/USER-INTEL/pair_airebo_intel.h
+++ b/src/USER-INTEL/pair_airebo_intel.h
@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/intel,PairAIREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_INTEL_H
+#define LMP_PAIR_AIREBO_INTEL_H
+
+#include "pair.h"
+#include "fix_intel.h"
+#include "pair_airebo.h"
+//#include "airebo_common.h"
+
+namespace LAMMPS_NS {
+
+template<class flt_t, class acc_t>
+struct PairAIREBOIntelParam;
+
+class PairAIREBOIntel : public PairAIREBO {
+ public:
+  PairAIREBOIntel(class LAMMPS *);
+  virtual ~PairAIREBOIntel();
+
+  // virtual pair-style entry points provided by this class
+  virtual void compute(int, int);
+  virtual void init_style();
+
+ protected:
+  // compute() overload templated on the precision types of the buffers
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers);
+
+  // evaluation routine with EVFLAG / EFLAG fixed at compile time
+  // NOTE(review): astart/aend presumably bound the atoms handled by one
+  // call - confirm against the .cpp
+  template <int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> *buffers,
+            const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(IntelBuffers<flt_t,acc_t> *buffers);
+
+  // returns a PairAIREBOIntelParam (only forward-declared in this header)
+  template <class flt_t, class acc_t>
+  PairAIREBOIntelParam<flt_t,acc_t> get_param();
+
+  FixIntel *fix;
+  int _cop;
+
+  // raw int arrays; allocation and release are not visible in this header
+  int *REBO_cnumneigh;
+  int *REBO_num_skin;
+  int *REBO_list_data;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair style AIREBO requires atom IDs
+
+This is a requirement to use the AIREBO potential.
+
+E: Pair style AIREBO requires newton pair on
+
+See the newton command.  This is a restriction to use the AIREBO
+potential.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Neighbor list overflow, boost neigh_modify one
+
+There are too many neighbors of a single atom.  Use the neigh_modify
+command to increase the max number of neighbors allowed for one atom.
+You may also want to boost the page size.
+
+E: Cannot open AIREBO potential file %s
+
+The specified AIREBO potential file cannot be opened.  Check that the
+path and name are correct.
+
+*/
--- a/src/USER-INTEL/pair_airebo_morse_intel.cpp
+++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp
@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_airebo_morse_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) :
+  PairAIREBOIntel(lmp)
+{
+  // nothing beyond base-class construction
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairAIREBOMorseIntel::settings(int narg, char **arg)
+{
+  // argument handling is delegated unchanged to the settings() reachable
+  // through PairAIREBOIntel
+
+  PairAIREBOIntel::settings(narg,arg);
+
+  // the only difference of this style: set morseflag after the base
+  // settings have been processed
+
+  morseflag = 1;
+}
--- a/src/USER-INTEL/pair_airebo_morse_intel.h
+++ b/src/USER-INTEL/pair_airebo_morse_intel.h
@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/morse/intel,PairAIREBOMorseIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H
+#define LMP_PAIR_AIREBO_MORSE_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairAIREBOMorseIntel : public PairAIREBOIntel {
+ public:
+  PairAIREBOMorseIntel(class LAMMPS *);
+
+  // processes the pair_style arguments (narg, arg)
+  virtual void settings(int, char **);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/pair_eam_alloy_intel.cpp
+++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp
@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_alloy_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) :
+  PairEAMIntel(lmp)
+{
+  // coeff() of this style only accepts a single "* *" command
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // expected arguments: * * filename, then one entry per atom type
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // both type arguments have to be the wildcard
+
+  const bool wildcards =
+    (strcmp(arg[0],"*") == 0) && (strcmp(arg[1],"*") == 0);
+  if (!wildcards)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // drop data from an earlier call, then read the setfl file again
+
+  if (setfl) {
+    for (int e = 0; e < setfl->nelements; e++) delete [] setfl->elements[e];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // arg[3..narg-1] name the element of atom types 1..ntypes
+  // map[type] = index of that element in the file, -1 for NULL
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+    int e = 0;
+    while (e < setfl->nelements && strcmp(arg[iarg],setfl->elements[e]) != 0)
+      e++;
+    if (e < setfl->nelements) map[itype] = e;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // reset setflag, because coeff() is called once with I,J = * *
+
+  const int ntypes = atom->ntypes;
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = ti; tj <= ntypes; tj++)
+      setflag[ti][tj] = 0;
+
+  // flag every type pair whose two types are mapped to elements
+  // the diagonal pair also sets the mass of that atom type
+
+  int npairs = 0;
+  for (int ti = 1; ti <= ntypes; ti++) {
+    for (int tj = ti; tj <= ntypes; tj++) {
+      const bool mapped = (map[ti] >= 0) && (map[tj] >= 0);
+      if (mapped) {
+        setflag[ti][tj] = 1;
+        if (ti == tj) atom->set_mass(FLERR,ti,setfl->mass[map[ti]]);
+        npairs++;
+      }
+      scale[ti][tj] = 1.0;
+    }
+  }
+
+  if (npairs == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::read_file(char *filename)
+{
+  // fills *setfl from the potential file named by filename
+  // only rank 0 touches the file; each value it reads is then sent
+  // to all other ranks with MPI_Bcast from rank 0
+
+  Setfl *file = setfl;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      char str[128];
+      // bounded formatting: an overlong filename is truncated instead of
+      // writing past the end of the fixed-size message buffer
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // the first three lines are read and discarded
+  // extract element names from nelements line (4th line)
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  // line must hold the element count followed by exactly that many names
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  // tokenize: first strtok() skips the count, the loop collects the names
+  // the extra slot in words[] receives the terminating NULL
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  // copy names out of line[], which is overwritten by later reads
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  // 5th line: nrho, drho, nr, dr, cut
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  // tables are filled starting at index 1, hence the +1 in each length
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+
+  // per element: one line with an integer (parsed, not stored) and the
+  // mass, followed by nrho frho values and nr rhor values
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  // z2r is stored in the file only for element pairs with j <= i
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = setfl->nelements;
+
+  // tabulation parameters are taken over unchanged from the setfl data
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // frho tables
+  // ------------------------------------------------------------------
+
+  // one table per setfl element plus one trailing table of zeroes
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  for (int e = 0; e < nelem; e++)
+    for (int k = 1; k <= nrho; k++) frho[e][k] = setfl->frho[e][k];
+
+  // the zero table serves non-EAM types (pair hybrid),
+  // since fp is still computed for those atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[type] = table index (0 to nfrho-1) used by that atom type
+  // unmapped types are pointed at the zero table
+
+  for (int t = 1; t <= ntypes; t++)
+    type2frho[t] = (map[t] >= 0) ? map[t] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor tables
+  // ------------------------------------------------------------------
+
+  // one table per setfl element
+
+  nrhor = nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  for (int e = 0; e < nelem; e++)
+    for (int k = 1; k <= nr; k++) rhor[e][k] = setfl->rhor[e][k];
+
+  // type2rhor[i][j] = table index (0 to nrhor-1) for each type pair
+  // with setfl data the index is determined by I alone
+  // map = -1 (non-EAM atom in pair hybrid) is OK b/c type2rhor is not used
+
+  for (int ti = 1; ti <= ntypes; ti++) {
+    const int ei = map[ti];
+    for (int tj = 1; tj <= ntypes; tj++)
+      type2rhor[ti][tj] = ei;
+  }
+
+  // ------------------------------------------------------------------
+  // z2r tables
+  // ------------------------------------------------------------------
+
+  // N*(N+1)/2 tables, N = # of setfl elements
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // pack the lower triangle (I >= J) row by row into consecutive tables
+
+  int slot = 0;
+  for (int ei = 0; ei < nelem; ei++)
+    for (int ej = 0; ej <= ei; ej++) {
+      for (int k = 1; k <= nr; k++) z2r[slot][k] = setfl->z2r[ei][ej][k];
+      slot++;
+    }
+
+  // type2z2r[i][j] = table index (0 to nz2r-1) for each type pair
+  // only the lower triangle of the Nelement matrix is stored, so the
+  //   larger element index acts as row and the smaller one as column
+  // index = number of entries in all rows before the row, plus the column
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int ti = 1; ti <= ntypes; ti++) {
+    for (int tj = 1; tj <= ntypes; tj++) {
+      const int ei = map[ti];
+      const int ej = map[tj];
+      if (ei == -1 || ej == -1) {
+        type2z2r[ti][tj] = 0;
+        continue;
+      }
+      const int row = (ei < ej) ? ej : ei;
+      const int col = (ei < ej) ? ei : ej;
+      int index = 0;
+      for (int r = 0; r < row; r++) index += r + 1;
+      index += col;
+      type2z2r[ti][tj] = index;
+    }
+  }
+}
--- a/src/USER-INTEL/pair_eam_alloy_intel.h
+++ b/src/USER-INTEL/pair_eam_alloy_intel.h
@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/intel,PairEAMAlloyIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_ALLOY_INTEL_H
+#define LMP_PAIR_EAM_ALLOY_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/alloy/opt inherits from it
+
+class PairEAMAlloyIntel : virtual public PairEAMIntel {
+ public:
+  PairEAMAlloyIntel(class LAMMPS *);
+  virtual ~PairEAMAlloyIntel() {}
+
+  // handles the pair_coeff arguments (narg, arg)
+  void coeff(int, char **);
+
+ protected:
+  // reads the potential file with the given name
+  void read_file(char *);
+
+  // copies the data read from the file into the array format
+  void file2array();
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/pair_eam_fs_intel.cpp
+++ b/src/USER-INTEL/pair_eam_fs_intel.cpp
@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_fs_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) :
+  PairEAMIntel(lmp)
+{
+  // coeff() of this style only accepts a single "* *" command
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read EAM Finnis-Sinclair file
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // expected arguments: * * filename, then one entry per atom type
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // both type arguments have to be the wildcard
+
+  const bool wildcards =
+    (strcmp(arg[0],"*") == 0) && (strcmp(arg[1],"*") == 0);
+  if (!wildcards)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // drop data from an earlier call, then read the Finnis-Sinclair file again
+
+  if (fs) {
+    for (int e = 0; e < fs->nelements; e++) delete [] fs->elements[e];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);
+
+  // arg[3..narg-1] name the element of atom types 1..ntypes
+  // map[type] = index of that element in the file, -1 for NULL
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+    int e = 0;
+    while (e < fs->nelements && strcmp(arg[iarg],fs->elements[e]) != 0)
+      e++;
+    if (e < fs->nelements) map[itype] = e;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // reset setflag, because coeff() is called once with I,J = * *
+
+  const int ntypes = atom->ntypes;
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = ti; tj <= ntypes; tj++)
+      setflag[ti][tj] = 0;
+
+  // flag every type pair whose two types are mapped to elements
+  // the diagonal pair also sets the mass of that atom type
+
+  int npairs = 0;
+  for (int ti = 1; ti <= ntypes; ti++) {
+    for (int tj = ti; tj <= ntypes; tj++) {
+      const bool mapped = (map[ti] >= 0) && (map[tj] >= 0);
+      if (mapped) {
+        setflag[ti][tj] = 1;
+        if (ti == tj) atom->set_mass(FLERR,ti,fs->mass[map[ti]]);
+        npairs++;
+      }
+      scale[ti][tj] = 1.0;
+    }
+  }
+
+  if (npairs == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element EAM Finnis-Sinclair file
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::read_file(char *filename)
+{
+  // fills *fs from the potential file named by filename
+  // only rank 0 touches the file; each value it reads is then sent
+  // to all other ranks with MPI_Bcast from rank 0
+
+  Fs *file = fs;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      char str[128];
+      // bounded formatting: an overlong filename is truncated instead of
+      // writing past the end of the fixed-size message buffer
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // the first three lines are read and discarded
+  // extract element names from nelements line (4th line)
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  // line must hold the element count followed by exactly that many names
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  // tokenize: first strtok() skips the count, the loop collects the names
+  // the extra slot in words[] receives the terminating NULL
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  // copy names out of line[], which is overwritten by later reads
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  // 5th line: nrho, drho, nr, dr, cut
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  // tables are filled starting at index 1, hence the +1 in each length
+  // rhor is indexed by an element pair here
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,
+                                              "pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+
+  // per element: one line with an integer (parsed, not stored) and the
+  // mass, followed by nrho frho values and nelements sets of nr rhor values
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  // z2r is stored in the file only for element pairs with j <= i
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in fs potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = fs->nelements;
+
+  // tabulation parameters are taken over unchanged from the fs data
+
+  nrho = fs->nrho;
+  nr = fs->nr;
+  drho = fs->drho;
+  dr = fs->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // frho tables
+  // ------------------------------------------------------------------
+
+  // one table per fs element plus one trailing table of zeroes
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  for (int e = 0; e < nelem; e++)
+    for (int k = 1; k <= nrho; k++) frho[e][k] = fs->frho[e][k];
+
+  // the zero table serves non-EAM types (pair hybrid),
+  // since fp is still computed for those atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[type] = table index (0 to nfrho-1) used by that atom type
+  // unmapped types are pointed at the zero table
+
+  for (int t = 1; t <= ntypes; t++)
+    type2frho[t] = (map[t] >= 0) ? map[t] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor tables
+  // ------------------------------------------------------------------
+
+  // one table per ordered element pair: N*N tables, N = # of fs elements
+
+  nrhor = nelem * nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // flatten the NxN set of tables row by row into consecutive slots
+
+  int slot = 0;
+  for (int ei = 0; ei < nelem; ei++)
+    for (int ej = 0; ej < nelem; ej++) {
+      for (int k = 1; k <= nr; k++) rhor[slot][k] = fs->rhor[ei][ej][k];
+      slot++;
+    }
+
+  // type2rhor[i][j] = table index (0 to nrhor-1) for each type pair
+  // index follows the row-by-row flattening used above
+  // map = -1 (non-EAM atom in pair hybrid) is OK b/c type2rhor is not used
+
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = 1; tj <= ntypes; tj++)
+      type2rhor[ti][tj] = map[ti] * fs->nelements + map[tj];
+
+  // ------------------------------------------------------------------
+  // z2r tables
+  // ------------------------------------------------------------------
+
+  // N*(N+1)/2 tables, N = # of fs elements
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // pack the lower triangle (I >= J) row by row into consecutive tables
+
+  slot = 0;
+  for (int ei = 0; ei < nelem; ei++)
+    for (int ej = 0; ej <= ei; ej++) {
+      for (int k = 1; k <= nr; k++) z2r[slot][k] = fs->z2r[ei][ej][k];
+      slot++;
+    }
+
+  // type2z2r[i][j] = table index (0 to nz2r-1) for each type pair
+  // only the lower triangle of the Nelement matrix is stored, so the
+  //   larger element index acts as row and the smaller one as column
+  // index = number of entries in all rows before the row, plus the column
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int ti = 1; ti <= ntypes; ti++) {
+    for (int tj = 1; tj <= ntypes; tj++) {
+      const int ei = map[ti];
+      const int ej = map[tj];
+      if (ei == -1 || ej == -1) {
+        type2z2r[ti][tj] = 0;
+        continue;
+      }
+      const int row = (ei < ej) ? ej : ei;
+      const int col = (ei < ej) ? ei : ej;
+      int index = 0;
+      for (int r = 0; r < row; r++) index += r + 1;
+      index += col;
+      type2z2r[ti][tj] = index;
+    }
+  }
+}
--- a/src/USER-INTEL/pair_eam_fs_intel.h
+++ b/src/USER-INTEL/pair_eam_fs_intel.h
@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/intel,PairEAMFSIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_INTEL_H
+#define LMP_PAIR_EAM_FS_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/fs/opt inherits from it
+
+// Pair style class registered as eam/fs/intel by the PairStyle() entry in
+// this header; it derives (virtually) from PairEAMIntel.
+class PairEAMFSIntel : virtual public PairEAMIntel {
+ public:
+  PairEAMFSIntel(class LAMMPS *);
+  virtual ~PairEAMFSIntel() {}   // empty: this class adds no cleanup of its own
+  void coeff(int, char **);
+
+ protected:
+  // NOTE(review): presumably reads the fs potential file named by the
+  // argument -- confirm against the implementation file
+  void read_file(char *);
+  // NOTE(review): presumably converts the data read from file into the
+  // per-type lookup arrays -- confirm against the implementation file
+  void file2array();
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@ -428,7 +428,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          } else
            multiple_forms = true;
        }
-        const int edge = (packed_j % pad_width);
+        const int edge = packed_j & (pad_width - 1);
        if (edge) {
          const int packed_end = packed_j + (pad_width - edge);
          #if defined(LMP_SIMD_COMPILER)
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@ -0,0 +1,595 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_charmm_coul_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the intel variant on top of the plain lj/charmm/coul/charmm
+// style and tag it with the INTEL suffix flag.
+PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+
+  // _cop is only assigned in init_style() when _LMP_INTEL_OFFLOAD is
+  // defined, but pack_force_const() always hands it to
+  // ForceConst::set_ntypes().  Start from a negative value (the code treats
+  // _cop < 0 as "no coprocessor") so that it is never read uninitialized.
+  _cop = -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Destructor: no work is done here.
+PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Non-templated entry point: pick the templated compute() instantiation
+// that matches the precision mode reported by the intel fix, then stamp
+// the load balancer and clear vflag_fdotr.
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
+{
+  const bool use_mixed =
+    (fix->precision() == FixIntel::PREC_MODE_MIXED);
+  const bool use_double =
+    !use_mixed && (fix->precision() == FixIntel::PREC_MODE_DOUBLE);
+
+  if (use_mixed) {
+    // float data, double accumulators
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else if (use_double) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else {
+    // any other mode falls back to all-float
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+// Precision-specific driver: sets up energy/virial flags, packs atom data
+// into the intel buffers when required, and launches eval() twice -- once
+// for the offload range [0, offload_end) and once for the host range
+// [host_start, inum).
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
+                                        IntelBuffers<flt_t,acc_t> *buffers,
+                                        const ForceConst<flt_t> &fc)
+{
+  if (!eflag && !vflag) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // pack only when the fix does not keep separate buffers and ago != 0
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+
+    // thread the pack only when more than INTEL_HTHREADS threads exist
+    int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // -------------------- Regular version
+  // virial mode handed to eval(): 2 = fdotr, 1 = plain vflag, 0 = none
+  const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+
+  // EFLAG and NEWTON_PAIR are template arguments, so every combination
+  // has to be spelled out explicitly
+  if (force->newton_pair) {
+    if (eflag) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (eflag) {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Force kernel for the contiguous atom range [astart, aend).
+//   EFLAG       - compile-time switch: accumulate energies when non-zero
+//   NEWTON_PAIR - compile-time switch: when non-zero the opposite force is
+//                 also applied to the neighbor j
+//   offload     - selects which buffer set is used; in offload builds it
+//                 also controls whether the offload pragma is active
+//   vflag       - virial mode prepared by the caller (0, 1 or 2)
+//   fc          - packed cutoffs, LJ coefficients and special-bond factors
+// Results are handed back through fix->add_result_array() at the end.
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc,
+                                     const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  // nothing to do for an empty range
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
+
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  // reciprocals so the inner loop multiplies instead of divides
+  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
+  const flt_t inv_denom_coul = (flt_t)1.0/denom_coul;
+
+  // flattened views of the per-type-pair tables (row 0 pointer)
+  const flt_t * _noalias const cutsq = fc.cutsq[0];
+  const LJ_T * _noalias const lj = fc.lj[0];
+  const flt_t cut_ljsq = fc.cut_ljsq;
+  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
+  const flt_t cut_coul_innersq = fc.cut_coul_innersq;
+  const flt_t cut_coulsq = fc.cut_coulsq;
+
+  // used below as the row stride when indexing the flattened tables
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // scratch arrays; each thread uses its own slice of ccache_stride entries
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \
+    in(vflag,eatom,f_stride,separate_flag,offload) \
+    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
+    in(inv_denom_coul,cut_coul_innersq) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, q);
+
+    // global energy / virial accumulators, reduced across threads below
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      // with newton on, each thread writes into its own f_stride-sized
+      // slice of the force buffer, which is zeroed here first
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      // NOTE(review): cutboth is never read in this function
+      flt_t cutboth = cut_coulsq;
+
+      // this thread's slice of the scratch arrays
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      for (int i = iifrom; i < iito; i += iip) {
+        //        const int i = ilist[ii];
+        const int itype = x[i].w;
+
+        // row of the flattened type-pair tables belonging to itype
+        const int ptr_off = itype * ntypes;
+        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
+        const LJ_T * _noalias const lji = lj + ptr_off;
+
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+        // pass 1: compact the neighbors that lie inside the coulomb cutoff
+        // into the scratch arrays; ej counts the survivors.  The raw list
+        // entry is stored in tj so its upper bits can be decoded in pass 2.
+        int ej = 0;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int j = jlist[jj] & NEIGHMASK;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq < cut_coulsq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
+
+        // pass 2: evaluate the interaction for every cached neighbor
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) {
+          flt_t forcecoul, forcelj, evdwl;
+          forcecoul = forcelj = evdwl = (flt_t)0.0;
+
+          const int j = tj[jj] & NEIGHMASK;
+          // bits above SBBITS select the special-bond factor (0 = none)
+          const int sbindex = tj[jj] >> SBBITS & 3;
+          const flt_t rsq = trsq[jj];
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+	  const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
+	  forcecoul = qqrd2e * qtmp * q[j] * r_inv;
+	  // between the inner and outer coulomb cutoff, scale by switch1
+	  if (rsq > cut_coul_innersq) {
+	    const flt_t ccr = cut_coulsq - rsq;
+	    const flt_t switch1 = ccr * ccr * inv_denom_coul *
+              (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq);
+            forcecoul *= switch1; 
+          }
+
+          // With INTEL_VMASK the LJ term is guarded by real branches;
+          // without it the term is always computed and zeroed afterwards
+          // when rsq is beyond the LJ cutoff.
+          #ifdef INTEL_VMASK
+          if (rsq < cut_ljsq) {
+          #endif
+	    const int jtype = tjtype[jj];
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
+            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
+
+            // LJ switching between the inner and outer LJ cutoff
+            #ifdef INTEL_VMASK
+            if (rsq > cut_lj_innersq) {
+            #endif
+              const flt_t drsq = cut_ljsq - rsq;
+              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
+              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
+                  inv_denom_lj;
+              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
+              if (EFLAG) {
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq) {
+                #endif
+                  forcelj = forcelj * switch1 + evdwl * switch2;
+                  evdwl *= switch1;
+                #ifndef INTEL_VMASK
+                }
+                #endif
+              } else {
+                // energy is not tallied here, but the unswitched LJ energy
+                // is still needed for the switch2 term of the force
+                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
+                    lji[jtype].w);
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq)
+                #endif
+                  forcelj =  forcelj * switch1 + philj * switch2;
+              }
+            #ifdef INTEL_VMASK
+            }
+            #endif
+
+          #ifdef INTEL_VMASK
+          }
+          #else
+          if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+          #endif
+	  // apply the special-bond scaling factors when sbindex is non-zero
+	  if (sbindex) {
+  	    const flt_t factor_coul = special_coul[sbindex];
+	    forcecoul *= factor_coul;
+	    const flt_t factor_lj = special_lj[sbindex];
+	    forcelj *= factor_lj;
+	    if (EFLAG) evdwl *= factor_lj;
+          }
+
+          const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+          if (EFLAG) {
+            sevdwl += evdwl;
+            // forcecoul (before the r2inv factor) is what gets tallied as
+            // the coulomb energy of the pair
+            secoul += forcecoul;
+            if (eatom) {
+              // per-atom energy: half of the pair energy to each atom
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+            }
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
+        } // for jj
+        // with newton on, f[i] may already hold contributions written as
+        // some other atom's neighbor, so accumulate; otherwise assign
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+    } // end of omp parallel region
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    // NOTE(review): the 0.5 factors for NEWTON_PAIR == 0 presumably correct
+    // for each pair being visited from both atoms (init_style() requests a
+    // full neighbor list in that case) -- confirm
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
+      }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  // hand forces (and energy/virial totals when requested) back to the fix
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Style initialization: run the base-class setup, adjust the most recent
+// neighbor request for the intel package, locate the intel fix, and pack
+// the force constants for the active precision mode.
+void PairLJCharmmCoulCharmmIntel::init_style()
+{
+  PairLJCharmmCoulCharmm::init_style();
+
+  // index of the last neighbor request in the list
+  const int ireq = neighbor->nrequest - 1;
+  if (force->newton_pair == 0) {
+    // newton off: switch the request from a half list to a full list
+    neighbor->requests[ireq]->half = 0;
+    neighbor->requests[ireq]->full = 1;
+  }
+  neighbor->requests[ireq]->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) {
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  }
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  // mixed and single precision share the float force constants
+  const bool use_mixed =
+    (fix->precision() == FixIntel::PREC_MODE_MIXED);
+  const bool use_double =
+    !use_mixed && (fix->precision() == FixIntel::PREC_MODE_DOUBLE);
+
+  if (use_mixed) {
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else if (use_double) {
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else {
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+// Copy the cutoffs, LJ coefficients and special-bond factors of this style
+// into the precision-specific ForceConst object, (re)size the per-thread
+// scratch cache, and -- in offload builds with a coprocessor -- transfer
+// the packed tables to the device.
+//   fc      - force-constant container to fill
+//   buffers - intel buffers matching the active precision mode
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
+  // tables are indexed by atom type starting at 1, hence ntypes + 1
+  int tp1 = atom->ntypes + 1;
+
+  fc.set_ntypes(tp1, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  // eval() only caches neighbors inside the coulomb cutoff, so the LJ
+  // cutoff must not exceed it
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+         "Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coul_innersq = cut_coul_innersq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  // entry 0 is always 1.0; entries 1-3 are copied from the force object
+  fc.special_lj[0] = 1.0;
+  fc.special_coul[0] = 1.0;
+  for (int i = 1; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_coul[i] = force->special_coul[i];
+  }
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.lj[i][j].x = lj1[i][j];
+      fc.lj[i][j].y = lj2[i][j];
+      fc.lj[i][j].z = lj3[i][j];
+      fc.lj[i][j].w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  // no coprocessor in use: nothing to transfer
+  if (_cop < 0) return;
+  // local aliases (shadowing the members) so the pragma can name them
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// (Re)size the per-type-pair tables to ntypes x ntypes.  When the size
+// changes, tables of the old size are released through the previously
+// stored Memory object and new ones are created through `memory`; in
+// offload builds the matching device copies are freed / allocated too.
+// The new size and Memory pointer are always recorded on exit.
+template <class flt_t>
+void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
+  const int ntypes, Memory *memory, const int cop) {
+  const bool resized = (ntypes != _ntypes);
+
+  // --- release the tables that belong to the old size
+  if (resized && _ntypes > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    flt_t * ospecial_coul = special_coul;
+    flt_t * ocutsq = cutsq[0];
+    LJ_T * olj = lj[0];
+    if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+        ospecial_coul != NULL && cop >= 0) {
+      #pragma offload_transfer target(mic:cop) \
+        nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+        nocopy(ocutsq, olj: alloc_if(0) free_if(1))
+    }
+    #endif
+
+    _memory->destroy(cutsq);
+    _memory->destroy(lj);
+  }
+
+  // --- create the tables for the new size
+  if (resized && ntypes > 0) {
+    _cop = cop;
+    memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+    memory->create(lj,ntypes,ntypes,"fc.lj");
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    flt_t * ospecial_coul = special_coul;
+    flt_t * ocutsq = cutsq[0];
+    LJ_T * olj = lj[0];
+    int tp1sq = ntypes*ntypes;
+    if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+        ospecial_coul != NULL && cop >= 0) {
+      #pragma offload_transfer target(mic:cop) \
+        nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+        nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+        nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+
+  _ntypes = ntypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+// Intel-package variant of the lj/charmm/coul/charmm pair style
+// (registered as lj/charmm/coul/charmm/intel by the PairStyle() entry in
+// this header).  Overrides compute() and init_style() of the base style.
+class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
+
+ public:
+  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulCharmmIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;                 // the "package_intel" fix, set in init_style()
+  int _cop, _ccache_stride;      // coprocessor number (< 0: none), scratch-cache stride
+
+  template <class flt_t> class ForceConst;
+
+  // precision-specific driver called from compute(int,int)
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+
+  // force kernel for the atom range [astart, aend)
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  // fill a ForceConst object from the base-class coefficients
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  // Packed force constants in precision flt_t: special-bond factors,
+  // squared cutoffs and the per-type-pair LJ coefficient table.
+  template <class flt_t>
+  class ForceConst {
+   public:
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
+    flt_t **cutsq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_coul_innersq, cut_lj_innersq;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+
+    // _cop and _memory must have defined values from the start: the
+    // destructor reads _cop even for an instance on which set_ntypes()
+    // was never called (only one of the single/double instances is used
+    // in any given run).
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    // resizing to zero types releases the tables, if any were created
+    ~ForceConst() { set_ntypes(0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;           // current table dimension, coprocessor number
+    Memory *_memory;             // Memory object used for the current tables
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic
+
+The intel accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
--- a/src/USER-INTEL/pair_rebo_intel.cpp
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_rebo_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// All construction work is delegated to the PairAIREBOIntel base class.
+PairREBOIntel::PairREBOIntel(LAMMPS *lmp) :
+  PairAIREBOIntel(lmp)
+{
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairREBOIntel::settings(int narg, char **arg)
+{
+  // this style takes no pair_style arguments
+  if (narg != 0) {
+    error->all(FLERR,"Illegal pair_style command");
+  }
+
+  // clear the LJ cutoff and the LJ / torsion flags
+  torflag = 0;
+  ljflag = 0;
+  cutlj = 0.0;
+
+  // this one parameter for C-C interactions is different in REBO vs AIREBO
+  // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016)
+
+  PCCf_2_0 = 0.0;
+}
--- a/src/USER-INTEL/pair_rebo_intel.h
+++ b/src/USER-INTEL/pair_rebo_intel.h
@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(rebo/intel,PairREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_REBO_INTEL_H
+#define LMP_PAIR_REBO_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+// Pair style class registered as rebo/intel by the PairStyle() entry in
+// this header; derives from PairAIREBOIntel and overrides only settings().
+class PairREBOIntel : public PairAIREBOIntel {
+ public:
+  PairREBOIntel(class LAMMPS *);
+  // handles the pair_style arguments (argument count, argument strings)
+  virtual void settings(int, char **);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@ -345,16 +345,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
            if (jj < jnumhalf) ejnumhalf++;
          }
        }
-        int ejnum_pad = ejnum;

-        while ( (ejnum_pad % pad_width) != 0) {
-          tdelx[ejnum_pad] = (flt_t)0.0;
-          tdely[ejnum_pad] = (flt_t)0.0;
-          tdelz[ejnum_pad] = (flt_t)0.0;
-          trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
-          tj[ejnum_pad] = nall;
-          if (!ONETYPE) tjtype[ejnum_pad] = 0;
-          ejnum_pad++;
+	int ejrem = ejnum & (pad_width - 1);
+	if (ejrem) ejrem = pad_width - ejrem;
+	const int ejnum_pad = ejnum + ejrem;
+	for (int jj = ejnum; jj < ejnum_pad; jj++) {
+          tdelx[jj] = (flt_t)0.0;
+          tdely[jj] = (flt_t)0.0;
+          tdelz[jj] = (flt_t)0.0;
+          trsq[jj] = p2[3].cutsq + (flt_t)1.0;
+          tj[jj] = nall;
+          if (!ONETYPE) tjtype[jj] = 0;
        }

        #if defined(LMP_SIMD_COMPILER)