USER-INTEL add-ons from Mike

2017-08-15 17:12:07 -06:00 · 2017-08-15 17:12:07 -06:00 · 1d4d2155a2
parent 0b3f1b8a15
commit 1d4d2155a2
35 changed files with 9675 additions and 49 deletions
--- a/doc/src/JPG/user_intel.png
+++ b/doc/src/JPG/user_intel.png
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
-Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
+Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
-charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
+buck, eam, eam/alloy, eam/fs, gayberne, charmm/coul/charmm, 
 charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
 sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule
--- a/doc/src/pair_airebo.txt
+++ b/doc/src/pair_airebo.txt
@ -7,10 +7,13 @@
 :line
 pair_style airebo command :h3
 pair_style airebo/intel command :h3
 pair_style airebo/omp command :h3
 pair_style airebo/morse command :h3
 pair_style airebo/morse/intel command :h3
 pair_style airebo/morse/omp command :h3
 pair_style rebo command :h3
 pair_style rebo/intel command :h3
 pair_style rebo/omp command :h3
 [Syntax:]
--- a/doc/src/pair_charmm.txt
+++ b/doc/src/pair_charmm.txt
@ -7,6 +7,7 @@
 :line
 pair_style lj/charmm/coul/charmm command :h3
 pair_style lj/charmm/coul/charmm/intel command :h3
 pair_style lj/charmm/coul/charmm/omp command :h3
 pair_style lj/charmm/coul/charmm/implicit command :h3
 pair_style lj/charmm/coul/charmm/implicit/omp command :h3
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@ -14,6 +14,7 @@ pair_style eam/omp command :h3
 pair_style eam/opt command :h3
 pair_style eam/alloy command :h3
 pair_style eam/alloy/gpu command :h3
 pair_style eam/alloy/intel command :h3
 pair_style eam/alloy/kk command :h3
 pair_style eam/alloy/omp command :h3
 pair_style eam/alloy/opt command :h3
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
 pair_style eam/cd/omp command :h3
 pair_style eam/fs command :h3
 pair_style eam/fs/gpu command :h3
 pair_style eam/fs/intel command :h3
 pair_style eam/fs/kk command :h3
 pair_style eam/fs/omp command :h3
 pair_style eam/fs/opt command :h3
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@ -14,7 +14,7 @@ SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@ -7,7 +7,7 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 CC =		mpicxx -cxx=icc
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@ -8,7 +8,7 @@ SHELL = /bin/sh
 export OMPI_CXX = icc
 CC =		mpicxx
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@ -46,6 +46,7 @@ action npair_intel.h
 action npair_intel.cpp
 action intel_simd.h pair_sw_intel.cpp
 action intel_intrinsics.h pair_tersoff_intel.cpp
 action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 action verlet_lrt_intel.h pppm.cpp
 action verlet_lrt_intel.cpp pppm.cpp
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@ -4,9 +4,9 @@
                     --------------------------------
             W. Michael Brown (Intel) michael.w.brown at intel.com
                  Markus Hohnerbach (RWTH Aachen University)
                   William McDoniel (RWTH Aachen University)
                   Rodrigo Canales (RWTH Aachen University)
                  Markus H<>hnerbach (RWTH Aachen University)
                           Stan Moore (Sandia)
 		   Ahmed E. Ismail (RWTH Aachen University)
                   Paolo Bientinesi (RWTH Aachen University)
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@ -8,6 +8,7 @@
 # in.intel.sw -	        Silicon benchmark with Stillinger-Weber
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
 # in.intel.airebo -     Polyethelene benchmark with AIREBO
 #
 #############################################################################
@ -24,6 +25,7 @@
 # in.intel.sw -	           132.4               161.9
 # in.intel.tersoff -        83.3               101.1
 # in.intel.water -          53.4                90.3
 # in.intel.airebo -          7.3                11.8
 #
 #############################################################################
--- a/src/USER-INTEL/TEST/in.intel.airebo
+++ b/src/USER-INTEL/TEST/in.intel.airebo
@ -0,0 +1,47 @@
 # AIREBO polyethelene benchmark
 variable        N index on      # Newton Setting
 variable	w index 10	# Warmup Timesteps
 variable	t index 550	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
 variable	p index 0	# Use Power Measurement
 variable	x index 4
 variable	y index 2
 variable	z index 2
 variable	xx equal 17*$x
 variable	yy equal 16*$y
 variable	zz equal 2*$z
 variable	rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 variable            root getenv LMP_ROOT
 units		    metal
 atom_style	    atomic
 read_data	    ${root}/examples/airebo/data.airebo
 replicate	    ${xx} ${yy} ${zz}
 neighbor	    0.5 bin
 neigh_modify	    delay 5 every 1
 pair_style	    airebo 3.0 1 1
 pair_coeff	    * * ${root}/potentials/CH.airebo C H
 velocity	    all create 300.0 761341
 fix		    1 all nve
 timestep	    0.0005
 thermo		    50
 if "$p > 0"	then "run_style verlet/power"
 if "$w > 0"	then "run $w"
 run		${rr}
--- a/src/USER-INTEL/TEST/in.intel.eam
+++ b/src/USER-INTEL/TEST/in.intel.eam
@ -5,7 +5,6 @@ variable	w index 10      # Warmup Timesteps
 variable	t index 3100    # Main Run Timesteps
 variable	m index 1       # Main Run Timestep Multiplier
 variable	n index 0       # Use NUMA Mapping for Multi-Node
 variable	b index 3       # Neighbor binsize
 variable	p index 0       # Use Power Measurement
 variable	x index 4
--- a/src/USER-INTEL/TEST/in.intel.rhodo
+++ b/src/USER-INTEL/TEST/in.intel.rhodo
@ -5,7 +5,6 @@ variable	w index 10	# Warmup Timesteps
 variable	t index 520	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
 variable        b index 3       # Neighbor binsize
 variable	p index 0	# Use Power Measurement
 variable	c index 0	# 1 to use collectives for PPPM
 variable        d index 1       # 1 to use 'diff ad' for PPPM
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -30,6 +30,9 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
  _off_map_listlocal = 0;
  _ccachex = 0;
  _ncache_alloc = 0;
  _ncachetag = 0;
  _cutneighsq = 0;
  _cutneighghostsq = 0;
  #ifdef _LMP_INTEL_OFFLOAD
  _separate_buffers = 0;
  _off_f = 0;
@ -447,12 +450,17 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
    flt_t *ncachez = _ncachez;
    int *ncachej = _ncachej;
    int *ncachejtype = _ncachejtype;
    int *ncachetag = _ncachetag;
    #ifdef _LMP_INTEL_OFFLOAD
    if (_off_ncache) {
      #pragma offload_transfer target(mic:_cop) \
        nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
        nocopy(ncachejtype:alloc_if(0) free_if(1))
      if (ncachetag) {
        #pragma offload_transfer target(mic:_cop) \
          nocopy(ncachetag:alloc_if(0) free_if(1))
      }
    }
    _off_ncache = 0;
    #endif
@ -462,8 +470,10 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
    lmp->memory->destroy(ncachez);
    lmp->memory->destroy(ncachej);
    lmp->memory->destroy(ncachejtype);
-
+    if (ncachetag)
      lmp->memory->destroy(ncachetag);
    _ncache_alloc = 0;
    _ncachetag = 0;
  }
 }
@ -480,7 +490,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
  const int vsize = _ncache_stride * nt;
  if (_ncache_alloc) {
-    if (vsize > _ncache_alloc)
+    if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0))
      free_ncache();
    #ifdef _LMP_INTEL_OFFLOAD
    else if (off_flag && _off_ncache == 0)
@ -495,6 +505,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
  lmp->memory->create(_ncachez, vsize, "_ncachez");
  lmp->memory->create(_ncachej, vsize, "_ncachej");
  lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
  if (need_tag())
    lmp->memory->create(_ncachetag, vsize, "_ncachetag");
  _ncache_alloc = vsize;
@ -513,6 +525,14 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
        nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
        nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
    }
    int tsize = vsize;
    if (!need_tag()) {
      tsize = 16;
      lmp->memory->create(_ncachetag, tsize, "_ncachetag");
    }
    int *ncachetag = _ncachetag;
    #pragma offload_transfer target(mic:_cop)			\
      nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0))
    _off_ncache = 1;
  }
  #endif
@ -548,7 +568,8 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
 /* ---------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes, 
 					    const int use_ghost_cut)
 {
  if (ntypes != _ntypes) {
    if (_ntypes > 0) {
@ -558,16 +579,34 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
        #pragma offload_transfer target(mic:_cop) \
          nocopy(cutneighsqo:alloc_if(0) free_if(1))
      }
      flt_t * cutneighghostsqo;
      if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) {
 	cutneighghostsqo = _cutneighghostsq[0];
        #pragma offload_transfer target(mic:_cop) \
          nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
      }
      #endif
      lmp->memory->destroy(_cutneighsq);
      if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq);
    }
    if (ntypes > 0) {
      lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
      if (use_ghost_cut)
 	lmp->memory->create(_cutneighghostsq, ntypes, ntypes, 
 			    "_cutneighghostsq");
      #ifdef _LMP_INTEL_OFFLOAD
      flt_t * cutneighsqo = _cutneighsq[0];
      const int ntypes2 = ntypes * ntypes;
      if (_off_threads > 0 && cutneighsqo != NULL) {
        #pragma offload_transfer target(mic:_cop) \
-          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+          nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0))
      }
      if (use_ghost_cut) {
        flt_t * cutneighghostsqo = _cutneighghostsq[0];
        if (_off_threads > 0 && cutneighghostsqo != NULL) {
          #pragma offload_transfer target(mic:_cop) \
            nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0))
        }
      }
      #endif
    }
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -109,12 +109,14 @@ class IntelBuffers {
  void free_ncache();
  void grow_ncache(const int off_flag, const int nthreads);
  void grow_ncachetag(const int off_flag, const int nthreads);
  inline int ncache_stride() { return _ncache_stride; }
  inline flt_t * get_ncachex() { return _ncachex; }
  inline flt_t * get_ncachey() { return _ncachey; }
  inline flt_t * get_ncachez() { return _ncachez; }
  inline int * get_ncachej() { return _ncachej; }
  inline int * get_ncachejtype() { return _ncachejtype; }
  inline int * get_ncachetag() { return _ncachetag; }
  inline int get_max_nbors() {
    int mn = lmp->neighbor->oneatom * sizeof(int) /
@ -131,7 +133,7 @@ class IntelBuffers {
      _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
  }
-  void set_ntypes(const int ntypes);
+  void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
  inline int * firstneigh(const NeighList *list) { return _list_alloc; }
  inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
@ -162,6 +164,7 @@ class IntelBuffers {
  inline void zero_ev()
    { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
  inline flt_t ** get_cutneighsq() { return _cutneighsq; }
  inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; }
  inline int get_off_threads() { return _off_threads; }
  #ifdef _LMP_INTEL_OFFLOAD
  inline void set_off_params(const int n, const int cop,
@ -274,13 +277,10 @@ class IntelBuffers {
             used_ghost * sizeof(flt_t));
    }
  }
  #endif
  inline int need_tag() { return _need_tag; }
  inline void need_tag(const int nt) { _need_tag = nt; }
  #else
  inline int need_tag() { return 0; }
  inline void need_tag(const int nt) { }
  #endif
  double memory_usage(const int nthreads);
@ -298,7 +298,7 @@ class IntelBuffers {
  int _list_alloc_atoms;
  int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
-  flt_t **_cutneighsq;
+  flt_t **_cutneighsq, **_cutneighghostsq;
  int _ntypes;
  int _ccache_stride;
@ -307,7 +307,10 @@ class IntelBuffers {
  int _ncache_stride, _ncache_alloc;
  flt_t *_ncachex, *_ncachey, *_ncachez;
-  int *_ncachej, *_ncachejtype;
+  int *_ncachej, *_ncachejtype, *_ncachetag;
  int _need_tag, _host_nmax;
  #ifdef LMP_USE_AVXCD
  int _ccache_stride3;
  acc_t * _ccachef;
@ -324,7 +327,6 @@ class IntelBuffers {
  int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
  int *_off_map_numneigh;
  bool _off_list_alloc;
  int _need_tag, _host_nmax;
  #endif
  int _buf_size, _buf_local_size;
--- a/src/USER-INTEL/intel_intrinsics_airebo.h
+++ b/src/USER-INTEL/intel_intrinsics_airebo.h
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@ -211,6 +211,8 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
    for (i = nall-1; i >= nlocal; i--) {
      if (mask[i] & bitmask) {
        ibin = coord2bin(atom->x[i]);
 	// Only necessary to store when neighboring ghost
 	atombin[i] = ibin;
        bins[i] = binhead[ibin];
        binhead[ibin] = i;
      }
@ -222,14 +224,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
      binhead[ibin] = i;
    }
  } else {
-    for (i = nall-1; i >= nlocal; i--) {
+    for (i = nall-1; i >= 0; i--) {
      ibin = coord2bin(atom->x[i]);
-      bins[i] = binhead[ibin];
+      // Only necessary to store for ghost when neighboring ghost
-      binhead[ibin] = i;
+      atombin[i] = ibin;
    }
    for (i = nlocal-1; i >= 0; i--) {
      ibin = coord2bin(atom->x[i]);
      atombin[i]=ibin;
      bins[i] = binhead[ibin];
      binhead[ibin] = i;
    }
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@ -0,0 +1,593 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 #include "npair_full_bin_ghost_intel.h"
 #include "neighbor.h"
 #include "nstencil.h"
 #include "neigh_list.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "molecule.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {}
 /* ----------------------------------------------------------------------
   binned neighbor list construction for all neighbors
   include neighbors of ghost atoms, but no "special neighbors" for ghosts
   every neighbor pair appears in list of both atoms i and j
 ------------------------------------------------------------------------- */
 void NPairFullBinGhostIntel::build(NeighList *list)
 {
  #ifdef _LMP_INTEL_OFFLOAD
  if (_fix->offload_noghost())
    error->all(FLERR,
      "The 'ghost no' option cannot be used with this USER-INTEL pair style.");
  #endif
  if (nstencil > INTEL_MAX_STENCIL_CHECK)
    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
  #ifdef _LMP_INTEL_OFFLOAD
  if (exclude)
    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
  #endif
  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
    fbi(list, _fix->get_mixed_buffers());
  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    fbi(list, _fix->get_double_buffers());
  else
    fbi(list, _fix->get_single_buffers());
  _fix->stop_watch(TIME_HOST_NEIGHBOR);
 }
 /* ---------------------------------------------------------------------- */
 template<class flt_t, class acc_t>
 void NPairFullBinGhostIntel::fbi(NeighList * list, 
 				 IntelBuffers<flt_t,acc_t> * buffers) 
 {
  const int nlocal = atom->nlocal;
  const int nall = atom->nlocal + atom->nghost;
  list->inum = atom->nlocal;
  list->gnum = atom->nghost;
  int host_start = _fix->host_start_neighbor();
  const int off_end = _fix->offload_end_neighbor();
  #ifdef _LMP_INTEL_OFFLOAD
  if (off_end) grow_stencil();
  if (_fix->full_host_list()) host_start = 0;
  int offload_noghost = _fix->offload_noghost();
  #endif
  // only uses offload_end_neighbor to check whether we are doing offloading
  // at all, no need to correct this later
  buffers->grow_list(list, nall, comm->nthreads, off_end,
 		     _fix->nbor_pack_width());
  int need_ic = 0;
  if (atom->molecular)
    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
 			 neighbor->cutneighmax);
  if (need_ic) {
    fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
    fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
  } else {
    fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
    fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
  }
 }
 /* ---------------------------------------------------------------------- */
 template<class flt_t, class acc_t, int need_ic>
 void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, 
 				 IntelBuffers<flt_t,acc_t> * buffers,
 				 const int pstart, const int pend) {
  if (pend-pstart == 0) return;
  const int nall = atom->nlocal + atom->nghost;
  int pad = 1;
  int nall_t = nall;
  const int aend = nall;
  const int pack_width = _fix->nbor_pack_width();
  const ATOM_T * _noalias const x = buffers->get_x();
  int * _noalias const firstneigh = buffers->firstneigh(list);
  const int e_nall = nall_t;
  const int molecular = atom->molecular;
  int *ns = NULL;
  tagint *s = NULL;
  int tag_size = 0, special_size;
  if (buffers->need_tag()) tag_size = e_nall;
  if (molecular) {
    s = atom->special[0];
    ns = atom->nspecial[0];
    special_size = aend;
  } else {
    s = &buffers->_special_holder;
    ns = &buffers->_nspecial_holder;
    special_size = 0;
  }
  const tagint * _noalias const special = s;
  const int * _noalias const nspecial = ns;
  const int maxspecial = atom->maxspecial;
  const tagint * _noalias const tag = atom->tag;
  int * _noalias const ilist = list->ilist;
  int * _noalias numneigh = list->numneigh;
  int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int nstencil = this->nstencil;
  const int * _noalias const stencil = this->stencil;
  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
  const flt_t * _noalias const cutneighghostsq = 
    buffers->get_cutneighghostsq()[0];
  const int ntypes = atom->ntypes + 1;
  const int nlocal = atom->nlocal;
  #ifndef _LMP_INTEL_OFFLOAD
  int * const mask = atom->mask;
  tagint * const molecule = atom->molecule;
  #endif
  int *molindex = atom->molindex;
  int *molatom = atom->molatom;
  Molecule **onemols = atom->avec->onemols;
  int moltemplate;
  if (molecular == 2) moltemplate = 1;
  else moltemplate = 0;
  if (moltemplate) 
    error->all(FLERR, 
 	       "Can't use moltemplate with npair style full/bin/ghost/intel.");
  int tnum;
  int *overflow;
  double *timer_compute;
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    timer_compute = _fix->off_watch_neighbor();
    tnum = buffers->get_off_threads();
    overflow = _fix->get_off_overflow_flag();
    _fix->stop_watch(TIME_HOST_NEIGHBOR);
    _fix->start_watch(TIME_OFFLOAD_LATENCY);
  } else
  #endif
  {
    tnum = comm->nthreads;
    overflow = _fix->get_overflow_flag();
  }
  const int nthreads = tnum;
  const int maxnbors = buffers->get_max_nbors();
  int * _noalias const atombin = buffers->get_atombin();
  const int * _noalias const binpacked = buffers->get_binpacked();
  const int xperiodic = domain->xperiodic;
  const int yperiodic = domain->yperiodic;
  const int zperiodic = domain->zperiodic;
  const flt_t xprd_half = domain->xprd_half;
  const flt_t yprd_half = domain->yprd_half;
  const flt_t zprd_half = domain->zprd_half;
  flt_t * _noalias const ncachex = buffers->get_ncachex();
  flt_t * _noalias const ncachey = buffers->get_ncachey();
  flt_t * _noalias const ncachez = buffers->get_ncachez();
  int * _noalias const ncachej = buffers->get_ncachej();
  int * _noalias const ncachejtype = buffers->get_ncachejtype();
  int * _noalias const ncachetag = buffers->get_ncachetag();
  const int ncache_stride = buffers->ncache_stride();
  const int mbinx = this->mbinx;
  const int mbiny = this->mbiny;
  const int mbinz = this->mbinz;
  const int * const stencilxyz = &this->stencilxyz[0][0];
  #ifdef _LMP_INTEL_OFFLOAD
  const int * _noalias const binhead = this->binhead;
  const int * _noalias const bins = this->bins;
  const int cop = _fix->coprocessor_number();
  const int separate_buffers = _fix->separate_buffers();
  #pragma offload target(mic:cop) if(offload) \
    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
    in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(ilist:length(0) alloc_if(0) free_if(0)) \
    in(atombin:length(aend) alloc_if(0) free_if(0)) \
    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
    in(stencilxyz:length(3*nstencil)) \
    out(overflow:length(5) alloc_if(0) free_if(0)) \
    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
    signal(tag)
  #endif
  {
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif
    #ifdef _LMP_INTEL_OFFLOAD
    overflow[LMP_LOCAL_MIN] = 0;
    overflow[LMP_LOCAL_MAX] = aend - 1;
    overflow[LMP_GHOST_MIN] = e_nall;
    overflow[LMP_GHOST_MAX] = -1;
    #endif
    int nstencilp = 0;
    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
    for (int k = 0; k < nstencil; k++) {
      binstart[nstencilp] = stencil[k];
      int end = stencil[k] + 1;
      for (int kk = k + 1; kk < nstencil; kk++) {
        if (stencil[kk-1]+1 == stencil[kk]) {
          end++;
          k++;
        } else break;
      }
      binend[nstencilp] = end;
      nstencilp++;
    }
    const int mbinyx = mbiny * mbinx;
    #if defined(_OPENMP)
    #pragma omp parallel
    #endif
    {
      const int num = aend;
      int tid, ifrom, ito;
      const double balance_factor = 2.0;
      const double ibalance_factor = 1.0 / balance_factor;
      const int gnum = num - nlocal;
      const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
      const int snum = wlocal + gnum;
      IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
      if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom);
      else ifrom -= wlocal - nlocal;
      if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
      else ito -= wlocal - nlocal;
      int e_ito = ito;
      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
      int which;
      int pack_offset = maxnbors;
      int ct = (ifrom + tid * 2) * maxnbors;
      int *neighptr = firstneigh + ct;
      const int obound = pack_offset + maxnbors * 2;
      const int toffs = tid * ncache_stride;
      flt_t * _noalias const tx = ncachex + toffs;
      flt_t * _noalias const ty = ncachey + toffs;
      flt_t * _noalias const tz = ncachez + toffs;
      int * _noalias const tj = ncachej + toffs;
      int * _noalias const tjtype = ncachejtype + toffs;
      int * _noalias const ttag = ncachetag + toffs;
      // loop over all atoms in other bins in stencil, store every pair
      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
      for (int i = ifrom; i < ito; i++) {
        const flt_t xtmp = x[i].x;
        const flt_t ytmp = x[i].y;
        const flt_t ztmp = x[i].z;
        const int itype = x[i].w;
        const tagint itag = tag[i];
        const int ioffset = ntypes * itype;
        const int ibin = atombin[i];
        if (ibin != oldbin) {
          oldbin = ibin;
          ncount = 0;
 	  if (i < nlocal) {
 	    for (int k = 0; k < nstencilp; k++) {
 	      const int bstart = binhead[ibin + binstart[k]];
 	      const int bend = binhead[ibin + binend[k]];
              #if defined(LMP_SIMD_COMPILER)
              #pragma vector aligned
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
                tj[ncount++] = binpacked[jj];
 	    }
 	  } else {
 	    const int zbin = ibin / mbinyx;
 	    const int zrem = ibin % mbinyx;
 	    const int ybin = zrem / mbinx;
 	    const int xbin = zrem % mbinx;
 	    for (int k = 0; k < nstencil; k++) {
 	      const int xbin2 = xbin + stencilxyz[3 * k + 0];
 	      const int ybin2 = ybin + stencilxyz[3 * k + 1];
 	      const int zbin2 = zbin + stencilxyz[3 * k + 2];
 	      if (xbin2 < 0 || xbin2 >= mbinx ||
                  ybin2 < 0 || ybin2 >= mbiny ||
                  zbin2 < 0 || zbin2 >= mbinz) continue;
 	      const int bstart = binhead[ibin + stencil[k]];
 	      const int bend = binhead[ibin + stencil[k] + 1];
              #if defined(LMP_SIMD_COMPILER)
              #pragma vector aligned
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
                tj[ncount++] = binpacked[jj];
 	    }
 	  } // if i < nlocal
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma simd
          #endif
          for (int u = 0; u < ncount; u++) {
            const int j = tj[u];
            tx[u] = x[j].x;
            ty[u] = x[j].y;
            tz[u] = x[j].z;
            tjtype[u] = x[j].w;
 	    ttag[u] = tag[j];
          }
 	} // if ibin != oldbin
        // ---------------------- Loop over other bins
        int n = maxnbors;
        int n2 = n * 2;
 	int *neighptr2 = neighptr;
 	const flt_t * _noalias cutsq;
 	if (i < nlocal) cutsq = cutneighsq;
 	else cutsq = cutneighghostsq;
 	const int icp = i;
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int u = 0; u < ncount; u++) {
          int addme = 1;
          int j = tj[u];
 	  if (i == j) addme = 0;
          // Cutoff Check
          const flt_t delx = xtmp - tx[u];
          const flt_t dely = ytmp - ty[u];
          const flt_t delz = ztmp - tz[u];
          const int jtype = tjtype[u];
 	  const int jtag = ttag[u];
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
          if (rsq > cutsq[ioffset + jtype]) addme = 0;
          if (need_ic && icp < nlocal) {
            int no_special;
 	    ominimum_image_check(no_special, delx, dely, delz);
            if (no_special)
              j = -j - 1;
          }
 	  int flist = 0;
 	  if (itag > jtag) {
 	    if (((itag+jtag) & 1) == 0) flist = 1;
 	  } else if (itag < jtag) {
 	    if (((itag+jtag) & 1) == 1) flist = 1;
 	  } else {
 	    if (tz[u] < ztmp) flist = 1;
 	    else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
 	    else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
 	      flist = 1;
 	  }
 	  if (addme) {
 	    if (flist)
 	      neighptr2[n2++] = j;
 	    else
 	      neighptr[n++] = j;
 	  }
        } // for u
        #ifndef _LMP_INTEL_OFFLOAD
        if (exclude) {
          int alln = n;
          n = maxnbors;
          for (int u = pack_offset; u < alln; u++) {
            const int j = neighptr[u];
            int pj = j;
            if (need_ic)
              if (pj < 0) pj = -j - 1;
            const int jtype = x[pj].w;
            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
            neighptr[n++] = j;
          }
 	  alln = n2;
 	  n2 = maxnbors * 2;
 	  for (int u = n2; u < alln; u++) {
 	    const int j = neighptr[u];
 	    int pj = j;
 	    if (need_ic)
 	      if (pj < 0) pj = -j - 1;
 	    const int jtype = x[pj].w;
 	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
 	    neighptr[n2++] = j;
 	  }
        }
        #endif
        int ns = n - maxnbors;
 	int alln = n;
 	atombin[i] = ns;
 	n = 0;
 	for (int u = maxnbors; u < alln; u++)
          neighptr[n++] = neighptr[u];
 	ns += n2 - maxnbors * 2;
 	for (int u = maxnbors * 2; u < n2; u++)
          neighptr[n++] = neighptr[u];
 	if (ns > maxnbors) *overflow = 1;
        ilist[i] = i;
        cnumneigh[i] = ct;
        numneigh[i] = ns;
 	ct += ns;
 	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
 	const int edge = ct & (alignb - 1);
 	if (edge) ct += alignb - edge;
 	neighptr = firstneigh + ct;
 	if (ct + obound > list_size) {
 	  if (i < ito - 1) {
 	    *overflow = 1;
 	    ct = (ifrom + tid * 2) * maxnbors;
 	  }
 	}
      }
      if (*overflow == 1)
        for (int i = ifrom; i < ito; i++)
          numneigh[i] = 0;
      #ifdef _LMP_INTEL_OFFLOAD
      int ghost_offset = 0, nall_offset = e_nall;
      if (separate_buffers) {
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          #if __INTEL_COMPILER+0 > 1499
          #pragma vector aligned
          #pragma simd
          #endif
          for (int jj = 0; jj < jnum; jj++) {
            int j = jlist[jj];
            if (need_ic && j < 0) j = -j - 1;
          }
        }
 	overflow[LMP_LOCAL_MIN] = 0;
 	overflow[LMP_LOCAL_MAX] = nlocal - 1;
 	overflow[LMP_GHOST_MIN] = nlocal;
 	overflow[LMP_GHOST_MAX] = e_nall - 1;
        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
        if (nghost < 0) nghost = 0;
        if (offload) {
          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
        } else {
          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
          nall_offset = nlocal + nghost;
        }
      } // if separate_buffers
      #endif
      if (molecular) {
 	int ito_m = ito;
 	if (ito >= nlocal) ito_m = nlocal; 
        for (int i = ifrom; i < ito_m; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          #if defined(LMP_SIMD_COMPILER)
          #pragma vector aligned
          #pragma simd
          #endif
          for (int jj = 0; jj < jnum; jj++) {
            const int j = jlist[jj];
            if (need_ic && j < 0) {
              which = 0;
              jlist[jj] = -j - 1;
            } else
              ofind_special(which, special, nspecial, i, tag[j]);
            #ifdef _LMP_INTEL_OFFLOAD
            if (j >= nlocal) {
              if (j == e_nall)
                jlist[jj] = nall_offset;
              else if (which)
                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
              else jlist[jj]-=ghost_offset;
            } else
            #endif
            if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        } // for i
      } // if molecular
      #ifdef _LMP_INTEL_OFFLOAD
      else if (separate_buffers) {
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
          int jj = 0;
          #pragma vector aligned
          #pragma simd
          for (jj = 0; jj < jnum; jj++) {
            if (jlist[jj] >= nlocal) {
              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
              else jlist[jj] -= ghost_offset;
            }
          }
        }
      }
      #endif
    } // end omp
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
    _fix->start_watch(TIME_HOST_NEIGHBOR);
    for (int n = 0; n < aend; n++) {
      ilist[n] = n;
      numneigh[n] = 0;
    }
  } else {
    for (int i = 0; i < aend; i++)
      list->firstneigh[i] = firstneigh + cnumneigh[i];
    if (separate_buffers) {
      _fix->start_watch(TIME_PACK);
      _fix->set_neighbor_host_sizes();
      buffers->pack_sep_from_single(_fix->host_min_local(),
                                    _fix->host_used_local(),
                                    _fix->host_min_ghost(),
                                    _fix->host_used_ghost());
      _fix->stop_watch(TIME_PACK);
    }
  }
  #else
  #pragma vector aligned
  #pragma simd
  for (int i = 0; i < aend; i++)
    list->firstneigh[i] = firstneigh + cnumneigh[i];
  #endif
 }
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.h
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h
@ -0,0 +1,55 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 #ifdef NPAIR_CLASS
 NPairStyle(full/bin/ghost/intel,
           NPairFullBinGhostIntel,
           NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | 
           NP_ORTHO | NP_TRI | NP_INTEL)
 #else
 #ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
 #define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
 #include "npair_intel.h"
 namespace LAMMPS_NS {
 class NPairFullBinGhostIntel : public NPairIntel {
 public:
  NPairFullBinGhostIntel(class LAMMPS *);
  ~NPairFullBinGhostIntel() {}
  void build(class NeighList *);
 private:
  template<class flt_t, class acc_t>
  void fbi(NeighList * list, IntelBuffers<flt_t,acc_t> * buffers);
  template<class flt_t, class acc_t, int need_ic>
  void fbi(const int offload, NeighList * list, 
 	   IntelBuffers<flt_t,acc_t> * buffers, 
           const int astart, const int aend);
 };
 }
 #endif
 #endif
 /* ERROR/WARNING messages:
 */
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@ -143,6 +143,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
  flt_t * _noalias const ncachez = buffers->get_ncachez();
  int * _noalias const ncachej = buffers->get_ncachej();
  int * _noalias const ncachejtype = buffers->get_ncachejtype();
  int * _noalias const ncachetag = buffers->get_ncachetag();
  const int ncache_stride = buffers->ncache_stride();
  #ifdef _LMP_INTEL_OFFLOAD
@ -165,7 +166,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
    in(atombin:length(aend) alloc_if(0) free_if(0)) \
    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
-    in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
    in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
    in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
@ -222,7 +223,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
      ito += astart;
      int e_ito = ito;
      if (THREE && ito == num) {
-        int imod = ito % pack_width;
+        int imod = ito & (pack_width - 1);
        if (imod) e_ito += pack_width - imod;
      }
      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
@ -241,6 +242,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
      flt_t * _noalias const tz = ncachez + toffs;
      int * _noalias const tj = ncachej + toffs;
      int * _noalias const tjtype = ncachejtype + toffs;
      int * _noalias const ttag = ncachetag + toffs;
      flt_t * _noalias itx;
      flt_t * _noalias ity;
@ -287,13 +289,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
            ty[u] = x[j].y;
            tz[u] = x[j].z;
            tjtype[u] = x[j].w;
 	    if (THREE) ttag[u] = tag[j];
          }
          if (FULL == 0 || TRI == 1) {
            icount = 0;
            istart = ncount;
            const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-            int nedge = istart % alignb;
+            int nedge = istart & (alignb - 1);
            if (nedge) istart + (alignb - nedge);
            itx = tx + istart;
            ity = ty + istart;
@ -343,7 +346,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
            // i bin (half) check and offload ghost check
            if (j < nlocal) {
-              const int ijmod = (i + j) % 2;
+              const int ijmod = (i + j) & 1;
              if (i > j) {
                if (ijmod == 0) addme = 0;
              } else if (i < j) {
@ -424,8 +427,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          }
          #endif
          int pj;
          if (THREE) pj = j;
          if (need_ic) {
            int no_special;
            ominimum_image_check(no_special, delx, dely, delz);
@ -434,12 +435,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          }
          if (THREE) {
-            const int jtag = tag[pj];
+            const int jtag = ttag[u];
            int flist = 0;
            if (itag > jtag) {
-              if ((itag+jtag) % 2 == 0) flist = 1;
+	      if (((itag+jtag) & 1) == 0) flist = 1;
            } else if (itag < jtag) {
-              if ((itag+jtag) % 2 == 1) flist = 1;
+	      if (((itag+jtag) & 1) == 1) flist = 1;
            } else {
              if (tz[u] < ztmp) flist = 1;
              else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
@ -512,7 +513,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          cnumneigh[i] += lane;
          numneigh[i] = ns;
        } else {
-          int edge = (n % pad_width);
+          int edge = n & (pad_width - 1);
          if (edge) {
            const int pad_end = n + (pad_width - edge);
            #if defined(LMP_SIMD_COMPILER)
@ -532,7 +533,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
          if (lane == pack_width) {
            ct += max_chunk * pack_width;
            const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-            const int edge = (ct % alignb);
+            const int edge = ct & (alignb - 1);
            if (edge) ct += alignb - edge;
            neighptr = firstneigh + ct;
            max_chunk = 0;
@ -548,7 +549,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
        } else {
          ct += n;
          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = (ct % alignb);
+          const int edge = ct & (alignb - 1);
          if (edge) ct += alignb - edge;
          neighptr = firstneigh + ct;
          if (ct + obound > list_size) {
--- a/src/USER-INTEL/pair_airebo_intel.cpp
+++ b/src/USER-INTEL/pair_airebo_intel.cpp
--- a/src/USER-INTEL/pair_airebo_intel.h
+++ b/src/USER-INTEL/pair_airebo_intel.h
@ -0,0 +1,110 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Markus Hohnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(airebo/intel,PairAIREBOIntel)
 #else
 #ifndef LMP_PAIR_AIREBO_INTEL_H
 #define LMP_PAIR_AIREBO_INTEL_H
 #include "pair.h"
 #include "fix_intel.h"
 #include "pair_airebo.h"
 //#include "airebo_common.h"
 namespace LAMMPS_NS {
 template<class flt_t, class acc_t>
 struct PairAIREBOIntelParam;
 class PairAIREBOIntel : public PairAIREBO {
 public:
  PairAIREBOIntel(class LAMMPS *);
  virtual ~PairAIREBOIntel();
  virtual void compute(int, int);
  virtual void init_style();
 protected:
  template <class flt_t, class acc_t>
  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers);
  template <int EVFLAG, int EFLAG, class flt_t, class acc_t>
  void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const int astart, const int aend);
  template <class flt_t, class acc_t>
  void pack_force_const(IntelBuffers<flt_t,acc_t> * buffers);
  template <class flt_t, class acc_t>
  PairAIREBOIntelParam<flt_t,acc_t> get_param();
  FixIntel * fix;
  int _cop;
  int * REBO_cnumneigh;
  int * REBO_num_skin;
  int * REBO_list_data;
 };
 }
 #endif
 #endif
 /* ERROR/WARNING messages:
 E: Illegal ... command
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 E: Incorrect args for pair coefficients
 Self-explanatory.  Check the input script or data file.
 E: Pair style AIREBO requires atom IDs
 This is a requirement to use the AIREBO potential.
 E: Pair style AIREBO requires newton pair on
 See the newton command.  This is a restriction to use the AIREBO
 potential.
 E: All pair coeffs are not set
 All pair coefficients must be set in the data file or by the
 pair_coeff command before running a simulation.
 E: Neighbor list overflow, boost neigh_modify one
 There are too many neighbors of a single atom.  Use the neigh_modify
 command to increase the max number of neighbors allowed for one atom.
 You may also want to boost the page size.
 E: Cannot open AIREBO potential file %s
 The specified AIREBO potential file cannot be opened.  Check that the
 path and name are correct.
 */
--- a/src/USER-INTEL/pair_airebo_morse_intel.cpp
+++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp
@ -0,0 +1,37 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Markus Hohnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #include "pair_airebo_morse_intel.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) 
  : PairAIREBOIntel(lmp) {}
 /* ----------------------------------------------------------------------
   global settings
 ------------------------------------------------------------------------- */
 void PairAIREBOMorseIntel::settings(int narg, char **arg)
 {
  PairAIREBOIntel::settings(narg,arg);
  morseflag = 1;
 }
--- a/src/USER-INTEL/pair_airebo_morse_intel.h
+++ b/src/USER-INTEL/pair_airebo_morse_intel.h
@ -0,0 +1,40 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Markus Hohnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(airebo/morse/intel,PairAIREBOMorseIntel)
 #else
 #ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H
 #define LMP_PAIR_AIREBO_MORSE_INTEL_H
 #include "pair_airebo_intel.h"
 namespace LAMMPS_NS {
 class PairAIREBOMorseIntel : public PairAIREBOIntel {
 public:
  PairAIREBOMorseIntel(class LAMMPS *);
  virtual void settings(int, char **);
 };
 }
 #endif
 #endif
--- a/src/USER-INTEL/pair_eam_alloy_intel.cpp
+++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp
@ -0,0 +1,326 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
 ------------------------------------------------------------------------- */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_eam_alloy_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 #define MAXLINE 1024
 /* ---------------------------------------------------------------------- */
 PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) : PairEAMIntel(lmp)
 {
  one_coeff = 1;
 }
 /* ----------------------------------------------------------------------
   set coeffs for one or more type pairs
   read DYNAMO setfl file
 ------------------------------------------------------------------------- */
 void PairEAMAlloyIntel::coeff(int narg, char **arg)
 {
  int i,j;
  if (!allocated) allocate();
  if (narg != 3 + atom->ntypes)
    error->all(FLERR,"Incorrect args for pair coefficients");
  // insure I,J args are * *
  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
    error->all(FLERR,"Incorrect args for pair coefficients");
  // read EAM setfl file
  if (setfl) {
    for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
    delete [] setfl->elements;
    delete [] setfl->mass;
    memory->destroy(setfl->frho);
    memory->destroy(setfl->rhor);
    memory->destroy(setfl->z2r);
    delete setfl;
  }
  setfl = new Setfl();
  read_file(arg[2]);
  // read args that map atom types to elements in potential file
  // map[i] = which element the Ith atom type is, -1 if NULL
  for (i = 3; i < narg; i++) {
    if (strcmp(arg[i],"NULL") == 0) {
      map[i-2] = -1;
      continue;
    }
    for (j = 0; j < setfl->nelements; j++)
      if (strcmp(arg[i],setfl->elements[j]) == 0) break;
    if (j < setfl->nelements) map[i-2] = j;
    else error->all(FLERR,"No matching element in EAM potential file");
  }
  // clear setflag since coeff() called once with I,J = * *
  int n = atom->ntypes;
  for (i = 1; i <= n; i++)
    for (j = i; j <= n; j++)
      setflag[i][j] = 0;
  // set setflag i,j for type pairs where both are mapped to elements
  // set mass of atom type if i = j
  int count = 0;
  for (i = 1; i <= n; i++) {
    for (j = i; j <= n; j++) {
      if (map[i] >= 0 && map[j] >= 0) {
        setflag[i][j] = 1;
        if (i == j) atom->set_mass(FLERR,i,setfl->mass[map[i]]);
        count++;
      }
      scale[i][j] = 1.0;
    }
  }
  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 /* ----------------------------------------------------------------------
   read a multi-element DYNAMO setfl file
 ------------------------------------------------------------------------- */
 void PairEAMAlloyIntel::read_file(char *filename)
 {
  Setfl *file = setfl;
  // open potential file
  int me = comm->me;
  FILE *fptr;
  char line[MAXLINE];
  if (me == 0) {
    fptr = force->open_potential(filename);
    if (fptr == NULL) {
      char str[128];
      sprintf(str,"Cannot open EAM potential file %s",filename);
      error->one(FLERR,str);
    }
  }
  // read and broadcast header
  // extract element names from nelements line
  int n;
  if (me == 0) {
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    n = strlen(line) + 1;
  }
  MPI_Bcast(&n,1,MPI_INT,0,world);
  MPI_Bcast(line,n,MPI_CHAR,0,world);
  sscanf(line,"%d",&file->nelements);
  int nwords = atom->count_words(line);
  if (nwords != file->nelements + 1)
    error->all(FLERR,"Incorrect element names in EAM potential file");
  char **words = new char*[file->nelements+1];
  nwords = 0;
  strtok(line," \t\n\r\f");
  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
  file->elements = new char*[file->nelements];
  for (int i = 0; i < file->nelements; i++) {
    n = strlen(words[i]) + 1;
    file->elements[i] = new char[n];
    strcpy(file->elements[i],words[i]);
  }
  delete [] words;
  if (me == 0) {
    fgets(line,MAXLINE,fptr);
    sscanf(line,"%d %lg %d %lg %lg",
           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
  }
  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
  file->mass = new double[file->nelements];
  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
                 "pair:z2r");
  int i,j,tmp;
  for (i = 0; i < file->nelements; i++) {
    if (me == 0) {
      fgets(line,MAXLINE,fptr);
      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
    }
    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
  }
  for (i = 0; i < file->nelements; i++)
    for (j = 0; j <= i; j++) {
      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
    }
  // close the potential file
  if (me == 0) fclose(fptr);
 }
 /* ----------------------------------------------------------------------
   copy read-in setfl potential to standard array format
 ------------------------------------------------------------------------- */
 void PairEAMAlloyIntel::file2array()
 {
  int i,j,m,n;
  int ntypes = atom->ntypes;
  // set function params directly from setfl file
  nrho = setfl->nrho;
  nr = setfl->nr;
  drho = setfl->drho;
  dr = setfl->dr;
  rhomax = (nrho-1) * drho;
  // ------------------------------------------------------------------
  // setup frho arrays
  // ------------------------------------------------------------------
  // allocate frho arrays
  // nfrho = # of setfl elements + 1 for zero array
  nfrho = setfl->nelements + 1;
  memory->destroy(frho);
  memory->create(frho,nfrho,nrho+1,"pair:frho");
  // copy each element's frho to global frho
  for (i = 0; i < setfl->nelements; i++)
    for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];
  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
  // this is necessary b/c fp is still computed for non-EAM atoms
  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
  // then map it to last frho array of zeroes
  for (i = 1; i <= ntypes; i++)
    if (map[i] >= 0) type2frho[i] = map[i];
    else type2frho[i] = nfrho-1;
  // ------------------------------------------------------------------
  // setup rhor arrays
  // ------------------------------------------------------------------
  // allocate rhor arrays
  // nrhor = # of setfl elements
  nrhor = setfl->nelements;
  memory->destroy(rhor);
  memory->create(rhor,nrhor,nr+1,"pair:rhor");
  // copy each element's rhor to global rhor
  for (i = 0; i < setfl->nelements; i++)
    for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];
  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
  // for setfl files, I,J mapping only depends on I
  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
  for (i = 1; i <= ntypes; i++)
    for (j = 1; j <= ntypes; j++)
      type2rhor[i][j] = map[i];
  // ------------------------------------------------------------------
  // setup z2r arrays
  // ------------------------------------------------------------------
  // allocate z2r arrays
  // nz2r = N*(N+1)/2 where N = # of setfl elements
  nz2r = setfl->nelements * (setfl->nelements+1) / 2;
  memory->destroy(z2r);
  memory->create(z2r,nz2r,nr+1,"pair:z2r");
  // copy each element pair z2r to global z2r, only for I >= J
  n = 0;
  for (i = 0; i < setfl->nelements; i++)
    for (j = 0; j <= i; j++) {
      for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
      n++;
    }
  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
  // set of z2r arrays only fill lower triangular Nelement matrix
  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
  // swap indices when irow < icol to stay lower triangular
  // if map = -1 (non-EAM atom in pair hybrid):
  //   type2z2r is not used by non-opt
  //   but set type2z2r to 0 since accessed by opt
  int irow,icol;
  for (i = 1; i <= ntypes; i++) {
    for (j = 1; j <= ntypes; j++) {
      irow = map[i];
      icol = map[j];
      if (irow == -1 || icol == -1) {
        type2z2r[i][j] = 0;
        continue;
      }
      if (irow < icol) {
        irow = map[j];
        icol = map[i];
      }
      n = 0;
      for (m = 0; m < irow; m++) n += m + 1;
      n += icol;
      type2z2r[i][j] = n;
    }
  }
 }
--- a/src/USER-INTEL/pair_eam_alloy_intel.h
+++ b/src/USER-INTEL/pair_eam_alloy_intel.h
@ -0,0 +1,43 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(eam/alloy/intel,PairEAMAlloyIntel)
 #else
 #ifndef LMP_PAIR_EAM_ALLOY_INTEL_H
 #define LMP_PAIR_EAM_ALLOY_INTEL_H
 #include "pair_eam_intel.h"
 namespace LAMMPS_NS {
 // need virtual public b/c of how eam/alloy/opt inherits from it
 class PairEAMAlloyIntel : virtual public PairEAMIntel {
 public:
  PairEAMAlloyIntel(class LAMMPS *);
  virtual ~PairEAMAlloyIntel() {}
  void coeff(int, char **);
 protected:
  void read_file(char *);
  void file2array();
 };
 }
 #endif
 #endif
--- a/src/USER-INTEL/pair_eam_fs_intel.cpp
+++ b/src/USER-INTEL/pair_eam_fs_intel.cpp
@ -0,0 +1,335 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: Tim Lau (MIT)
 ------------------------------------------------------------------------- */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_eam_fs_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 #define MAXLINE 1024
 /* ---------------------------------------------------------------------- */
 PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) : PairEAMIntel(lmp)
 {
  one_coeff = 1;
 }
 /* ----------------------------------------------------------------------
   set coeffs for one or more type pairs
   read EAM Finnis-Sinclair file
 ------------------------------------------------------------------------- */
 void PairEAMFSIntel::coeff(int narg, char **arg)
 {
  int i,j;
  if (!allocated) allocate();
  if (narg != 3 + atom->ntypes)
    error->all(FLERR,"Incorrect args for pair coefficients");
  // insure I,J args are * *
  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
    error->all(FLERR,"Incorrect args for pair coefficients");
  // read EAM Finnis-Sinclair file
  if (fs) {
    for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
    delete [] fs->elements;
    delete [] fs->mass;
    memory->destroy(fs->frho);
    memory->destroy(fs->rhor);
    memory->destroy(fs->z2r);
    delete fs;
  }
  fs = new Fs();
  read_file(arg[2]);
  // read args that map atom types to elements in potential file
  // map[i] = which element the Ith atom type is, -1 if NULL
  for (i = 3; i < narg; i++) {
    if (strcmp(arg[i],"NULL") == 0) {
      map[i-2] = -1;
      continue;
    }
    for (j = 0; j < fs->nelements; j++)
      if (strcmp(arg[i],fs->elements[j]) == 0) break;
    if (j < fs->nelements) map[i-2] = j;
    else error->all(FLERR,"No matching element in EAM potential file");
  }
  // clear setflag since coeff() called once with I,J = * *
  int n = atom->ntypes;
  for (i = 1; i <= n; i++)
    for (j = i; j <= n; j++)
      setflag[i][j] = 0;
  // set setflag i,j for type pairs where both are mapped to elements
  // set mass of atom type if i = j
  int count = 0;
  for (i = 1; i <= n; i++) {
    for (j = i; j <= n; j++) {
      if (map[i] >= 0 && map[j] >= 0) {
        setflag[i][j] = 1;
        if (i == j) atom->set_mass(FLERR,i,fs->mass[map[i]]);
        count++;
      }
      scale[i][j] = 1.0;
    }
  }
  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 /* ----------------------------------------------------------------------
   read a multi-element DYNAMO setfl file
 ------------------------------------------------------------------------- */
 void PairEAMFSIntel::read_file(char *filename)
 {
  Fs *file = fs;
  // open potential file
  int me = comm->me;
  FILE *fptr;
  char line[MAXLINE];
  if (me == 0) {
    fptr = force->open_potential(filename);
    if (fptr == NULL) {
      char str[128];
      sprintf(str,"Cannot open EAM potential file %s",filename);
      error->one(FLERR,str);
    }
  }
  // read and broadcast header
  // extract element names from nelements line
  int n;
  if (me == 0) {
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    fgets(line,MAXLINE,fptr);
    n = strlen(line) + 1;
  }
  MPI_Bcast(&n,1,MPI_INT,0,world);
  MPI_Bcast(line,n,MPI_CHAR,0,world);
  sscanf(line,"%d",&file->nelements);
  int nwords = atom->count_words(line);
  if (nwords != file->nelements + 1)
    error->all(FLERR,"Incorrect element names in EAM potential file");
  char **words = new char*[file->nelements+1];
  nwords = 0;
  strtok(line," \t\n\r\f");
  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
  file->elements = new char*[file->nelements];
  for (int i = 0; i < file->nelements; i++) {
    n = strlen(words[i]) + 1;
    file->elements[i] = new char[n];
    strcpy(file->elements[i],words[i]);
  }
  delete [] words;
  if (me == 0) {
    fgets(line,MAXLINE,fptr);
    sscanf(line,"%d %lg %d %lg %lg",
           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
  }
  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
  file->mass = new double[file->nelements];
  memory->create(file->frho,file->nelements,file->nrho+1,
                                              "pair:frho");
  memory->create(file->rhor,file->nelements,file->nelements,
                 file->nr+1,"pair:rhor");
  memory->create(file->z2r,file->nelements,file->nelements,
                 file->nr+1,"pair:z2r");
  int i,j,tmp;
  for (i = 0; i < file->nelements; i++) {
    if (me == 0) {
      fgets(line,MAXLINE,fptr);
      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
    }
    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
    for (j = 0; j < file->nelements; j++) {
      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
    }
  }
  for (i = 0; i < file->nelements; i++)
    for (j = 0; j <= i; j++) {
      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
    }
  // close the potential file
  if (me == 0) fclose(fptr);
 }
 /* ----------------------------------------------------------------------
   copy read-in setfl potential to standard array format
 ------------------------------------------------------------------------- */
 void PairEAMFSIntel::file2array()
 {
  int i,j,m,n;
  int ntypes = atom->ntypes;
  // set function params directly from fs file
  nrho = fs->nrho;
  nr = fs->nr;
  drho = fs->drho;
  dr = fs->dr;
  rhomax = (nrho-1) * drho;
  // ------------------------------------------------------------------
  // setup frho arrays
  // ------------------------------------------------------------------
  // allocate frho arrays
  // nfrho = # of fs elements + 1 for zero array
  nfrho = fs->nelements + 1;
  memory->destroy(frho);
  memory->create(frho,nfrho,nrho+1,"pair:frho");
  // copy each element's frho to global frho
  for (i = 0; i < fs->nelements; i++)
    for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m];
  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
  // this is necessary b/c fp is still computed for non-EAM atoms
  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
  // then map it to last frho array of zeroes
  for (i = 1; i <= ntypes; i++)
    if (map[i] >= 0) type2frho[i] = map[i];
    else type2frho[i] = nfrho-1;
  // ------------------------------------------------------------------
  // setup rhor arrays
  // ------------------------------------------------------------------
  // allocate rhor arrays
  // nrhor = square of # of fs elements
  nrhor = fs->nelements * fs->nelements;
  memory->destroy(rhor);
  memory->create(rhor,nrhor,nr+1,"pair:rhor");
  // copy each element pair rhor to global rhor
  n = 0;
  for (i = 0; i < fs->nelements; i++)
    for (j = 0; j < fs->nelements; j++) {
      for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m];
      n++;
    }
  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
  // for fs files, there is a full NxN set of rhor arrays
  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
  for (i = 1; i <= ntypes; i++)
    for (j = 1; j <= ntypes; j++)
      type2rhor[i][j] = map[i] * fs->nelements + map[j];
  // ------------------------------------------------------------------
  // setup z2r arrays
  // ------------------------------------------------------------------
  // allocate z2r arrays
  // nz2r = N*(N+1)/2 where N = # of fs elements
  nz2r = fs->nelements * (fs->nelements+1) / 2;
  memory->destroy(z2r);
  memory->create(z2r,nz2r,nr+1,"pair:z2r");
  // copy each element pair z2r to global z2r, only for I >= J
  n = 0;
  for (i = 0; i < fs->nelements; i++)
    for (j = 0; j <= i; j++) {
      for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m];
      n++;
    }
  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
  // set of z2r arrays only fill lower triangular Nelement matrix
  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
  // swap indices when irow < icol to stay lower triangular
  // if map = -1 (non-EAM atom in pair hybrid):
  //   type2z2r is not used by non-opt
  //   but set type2z2r to 0 since accessed by opt
  int irow,icol;
  for (i = 1; i <= ntypes; i++) {
    for (j = 1; j <= ntypes; j++) {
      irow = map[i];
      icol = map[j];
      if (irow == -1 || icol == -1) {
        type2z2r[i][j] = 0;
        continue;
      }
      if (irow < icol) {
        irow = map[j];
        icol = map[i];
      }
      n = 0;
      for (m = 0; m < irow; m++) n += m + 1;
      n += icol;
      type2z2r[i][j] = n;
    }
  }
 }
--- a/src/USER-INTEL/pair_eam_fs_intel.h
+++ b/src/USER-INTEL/pair_eam_fs_intel.h
@ -0,0 +1,43 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(eam/fs/intel,PairEAMFSIntel)
 #else
 #ifndef LMP_PAIR_EAM_FS_INTEL_H
 #define LMP_PAIR_EAM_FS_INTEL_H
 #include "pair_eam_intel.h"
 namespace LAMMPS_NS {
 // need virtual public b/c of how eam/fs/opt inherits from it
 class PairEAMFSIntel : virtual public PairEAMIntel {
 public:
  PairEAMFSIntel(class LAMMPS *);
  virtual ~PairEAMFSIntel() {}
  void coeff(int, char **);
 protected:
  void read_file(char *);
  void file2array();
 };
 }
 #endif
 #endif
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@ -428,7 +428,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          } else
            multiple_forms = true;
        }
-        const int edge = (packed_j % pad_width);
+        const int edge = packed_j & (pad_width - 1);
        if (edge) {
          const int packed_end = packed_j + (pad_width - edge);
          #if defined(LMP_SIMD_COMPILER)
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@ -0,0 +1,595 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   This software is distributed under the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 #include <math.h>
 #include "pair_lj_charmm_coul_charmm_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "memory.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 #define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
 /* ---------------------------------------------------------------------- */
 PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
  PairLJCharmmCoulCharmm(lmp)
 {
  suffix_flag |= Suffix::INTEL;
 }
 /* ---------------------------------------------------------------------- */
 PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
 {
 }
 /* ---------------------------------------------------------------------- */
 void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
 {
  if (fix->precision()==FixIntel::PREC_MODE_MIXED)
    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                          force_const_single);
  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                           force_const_double);
  else
    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                         force_const_single);
  fix->balance_stamp();
  vflag_fdotr = 0;
 }
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
                                        IntelBuffers<flt_t,acc_t> *buffers,
                                        const ForceConst<flt_t> &fc)
 {
  if (eflag || vflag) {
    ev_setup(eflag,vflag);
  } else evflag = vflag_fdotr = 0;
  const int inum = list->inum;
  const int nthreads = comm->nthreads;
  const int host_start = fix->host_start_pair();
  const int offload_end = fix->offload_end_pair();
  const int ago = neighbor->ago;
  if (ago != 0 && fix->separate_buffers() == 0) {
    fix->start_watch(TIME_PACK);
    int packthreads;
    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
    else packthreads = 1;
    #if defined(_OPENMP)
    #pragma omp parallel if(packthreads > 1)
    #endif
    {
      int ifrom, ito, tid;
      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
                                packthreads, sizeof(ATOM_T));
      buffers->thr_pack(ifrom,ito,ago);
    }
    fix->stop_watch(TIME_PACK);
  }
  // -------------------- Regular version
  int ovflag = 0;
  if (vflag_fdotr) ovflag = 2;
  else if (vflag) ovflag = 1;
  if (eflag) {
    if (force->newton_pair) {
      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
    } else {
      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
    }
  } else {
    if (force->newton_pair) {
      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
    } else {
      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
    }
  }
 }
 /* ---------------------------------------------------------------------- */
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc,
                                     const int astart, const int aend)
 {
  const int inum = aend - astart;
  if (inum == 0) return;
  int nlocal, nall, minlocal;
  fix->get_buffern(offload, nlocal, nall, minlocal);
  const int ago = neighbor->ago;
  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
  ATOM_T * _noalias const x = buffers->get_x(offload);
  flt_t * _noalias const q = buffers->get_q(offload);
  const int * _noalias const numneigh = list->numneigh;
  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
  const int * _noalias const firstneigh = buffers->firstneigh(list);
  const flt_t * _noalias const special_coul = fc.special_coul;
  const flt_t * _noalias const special_lj = fc.special_lj;
  const flt_t qqrd2e = force->qqrd2e;
  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
  const flt_t inv_denom_coul = (flt_t)1.0/denom_coul;
  const flt_t * _noalias const cutsq = fc.cutsq[0];
  const LJ_T * _noalias const lj = fc.lj[0];
  const flt_t cut_ljsq = fc.cut_ljsq;
  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
  const flt_t cut_coul_innersq = fc.cut_coul_innersq;
  const flt_t cut_coulsq = fc.cut_coulsq;
  const int ntypes = atom->ntypes + 1;
  const int eatom = this->eflag_atom;
  flt_t * _noalias const ccachex = buffers->get_ccachex();
  flt_t * _noalias const ccachey = buffers->get_ccachey();
  flt_t * _noalias const ccachez = buffers->get_ccachez();
  flt_t * _noalias const ccachew = buffers->get_ccachew();
  int * _noalias const ccachei = buffers->get_ccachei();
  int * _noalias const ccachej = buffers->get_ccachej();
  const int ccache_stride = _ccache_stride;
  // Determine how much data to transfer
  int x_size, q_size, f_stride, ev_size, separate_flag;
  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                       buffers, offload, fix, separate_flag,
                       x_size, q_size, ev_size, f_stride);
  int tc;
  FORCE_T * _noalias f_start;
  acc_t * _noalias ev_global;
  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
  const int nthreads = tc;
  #ifdef _LMP_INTEL_OFFLOAD
  int *overflow = fix->get_off_overflow_flag();
  double *timer_compute = fix->off_watch_pair();
  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
  #pragma offload target(mic:_cop) if(offload) \
    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
    in(numneigh:length(0) alloc_if(0) free_if(0)) \
    in(x:length(x_size) alloc_if(0) free_if(0)) \
    in(q:length(q_size) alloc_if(0) free_if(0)) \
    in(overflow:length(0) alloc_if(0) free_if(0)) \
    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
    in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \
    in(vflag,eatom,f_stride,separate_flag,offload) \
    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
    in(inv_denom_coul,cut_coul_innersq) \
    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
    signal(f_start)
  #endif
  {
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif
    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                              f_stride, x, q);
    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
    if (EFLAG) oevdwl = oecoul = (acc_t)0;
    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
    // loop over neighbors of my atoms
    #if defined(_OPENMP)
    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
    #endif
    {
      int iifrom, iip, iito, tid;
      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
      iifrom += astart;
      iito += astart;
      int foff;
      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
      else foff = -minlocal;
      FORCE_T * _noalias const f = f_start + foff;
      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
      flt_t cutboth = cut_coulsq;
      const int toffs = tid * ccache_stride;
      flt_t * _noalias const tdelx = ccachex + toffs;
      flt_t * _noalias const tdely = ccachey + toffs;
      flt_t * _noalias const tdelz = ccachez + toffs;
      flt_t * _noalias const trsq = ccachew + toffs;
      int * _noalias const tj = ccachei + toffs;
      int * _noalias const tjtype = ccachej + toffs;
      for (int i = iifrom; i < iito; i += iip) {
        //        const int i = ilist[ii];
        const int itype = x[i].w;
        const int ptr_off = itype * ntypes;
        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
        const LJ_T * _noalias const lji = lj + ptr_off;
        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];
        acc_t fxtmp,fytmp,fztmp,fwtmp;
        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
        const flt_t xtmp = x[i].x;
        const flt_t ytmp = x[i].y;
        const flt_t ztmp = x[i].z;
        const flt_t qtmp = q[i];
        fxtmp = fytmp = fztmp = (acc_t)0;
        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
        if (NEWTON_PAIR == 0)
          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
        int ej = 0;
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma ivdep
        #endif
        for (int jj = 0; jj < jnum; jj++) {
          const int j = jlist[jj] & NEIGHMASK;
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
          const flt_t rsq = delx * delx + dely * dely + delz * delz;
          if (rsq < cut_coulsq) {
            trsq[ej]=rsq;
            tdelx[ej]=delx;
            tdely[ej]=dely;
            tdelz[ej]=delz;
            tjtype[ej]=x[j].w;
            tj[ej]=jlist[jj];
            ej++;
          }
        }
        #if defined(LMP_SIMD_COMPILER)
        #pragma vector aligned
        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                               sv0, sv1, sv2, sv3, sv4, sv5)
        #endif
        for (int jj = 0; jj < ej; jj++) {
          flt_t forcecoul, forcelj, evdwl;
          forcecoul = forcelj = evdwl = (flt_t)0.0;
          const int j = tj[jj] & NEIGHMASK;
          const int sbindex = tj[jj] >> SBBITS & 3;
          const flt_t rsq = trsq[jj];
          const flt_t r2inv = (flt_t)1.0 / rsq;
 	  const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
 	  forcecoul = qqrd2e * qtmp * q[j] * r_inv;
 	  if (rsq > cut_coul_innersq) {
 	    const flt_t ccr = cut_coulsq - rsq;
 	    const flt_t switch1 = ccr * ccr * inv_denom_coul *
              (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq);
            forcecoul *= switch1; 
          }
          #ifdef INTEL_VMASK
          if (rsq < cut_ljsq) {
          #endif
 	    const int jtype = tjtype[jj];
            flt_t r6inv = r2inv * r2inv * r2inv;
            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
            #ifdef INTEL_VMASK
            if (rsq > cut_lj_innersq) {
            #endif
              const flt_t drsq = cut_ljsq - rsq;
              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
                  inv_denom_lj;
              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
              if (EFLAG) {
                #ifndef INTEL_VMASK
                if (rsq > cut_lj_innersq) {
                #endif
                  forcelj = forcelj * switch1 + evdwl * switch2;
                  evdwl *= switch1;
                #ifndef INTEL_VMASK
                }
                #endif
              } else {
                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
                    lji[jtype].w);
                #ifndef INTEL_VMASK
                if (rsq > cut_lj_innersq)
                #endif
                  forcelj =  forcelj * switch1 + philj * switch2;
              }
            #ifdef INTEL_VMASK
            }
            #endif
          #ifdef INTEL_VMASK
          }
          #else
          if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
          #endif
 	  if (sbindex) {
  	    const flt_t factor_coul = special_coul[sbindex];
 	    forcecoul *= factor_coul;
 	    const flt_t factor_lj = special_lj[sbindex];
 	    forcelj *= factor_lj;
 	    if (EFLAG) evdwl *= factor_lj;
          }
          const flt_t fpair = (forcecoul + forcelj) * r2inv;
          const flt_t fpx = fpair * tdelx[jj];
          fxtmp += fpx;
          if (NEWTON_PAIR) f[j].x -= fpx;
          const flt_t fpy = fpair * tdely[jj];
          fytmp += fpy;
          if (NEWTON_PAIR) f[j].y -= fpy;
          const flt_t fpz = fpair * tdelz[jj];
          fztmp += fpz;
          if (NEWTON_PAIR) f[j].z -= fpz;
          if (EFLAG) {
            sevdwl += evdwl;
            secoul += forcecoul;
            if (eatom) {
              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
              if (NEWTON_PAIR)
                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
            }
          }
          if (NEWTON_PAIR == 0)
            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
                                  fpx, fpy, fpz);
        } // for jj
        if (NEWTON_PAIR) {
          f[i].x += fxtmp;
          f[i].y += fytmp;
          f[i].z += fztmp;
        } else {
          f[i].x = fxtmp;
          f[i].y = fytmp;
          f[i].z = fztmp;
        }
        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
      } // for ii
      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                              ov4, ov5);
    } // end of omp parallel region
    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                        ov0, ov1, ov2, ov3, ov4, ov5);
    if (EFLAG) {
      if (NEWTON_PAIR == 0) {
        oevdwl *= (acc_t)0.5;
        oecoul *= (acc_t)0.5;
      }
      ev_global[0] = oevdwl;
      ev_global[1] = oecoul;
    }
    if (vflag) {
      if (NEWTON_PAIR == 0) {
        ov0 *= (acc_t)0.5;
        ov1 *= (acc_t)0.5;
        ov2 *= (acc_t)0.5;
        ov3 *= (acc_t)0.5;
        ov4 *= (acc_t)0.5;
        ov5 *= (acc_t)0.5;
      }
      ev_global[2] = ov0;
      ev_global[3] = ov1;
      ev_global[4] = ov2;
      ev_global[5] = ov3;
      ev_global[6] = ov4;
      ev_global[7] = ov5;
    }
    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end of offload region
  if (offload)
    fix->stop_watch(TIME_OFFLOAD_LATENCY);
  else
    fix->stop_watch(TIME_HOST_PAIR);
  if (EFLAG || vflag)
    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
 /* ---------------------------------------------------------------------- */
 void PairLJCharmmCoulCharmmIntel::init_style()
 {
  PairLJCharmmCoulCharmm::init_style();
  if (force->newton_pair == 0) {
    neighbor->requests[neighbor->nrequest-1]->half = 0;
    neighbor->requests[neighbor->nrequest-1]->full = 1;
  }
  neighbor->requests[neighbor->nrequest-1]->intel = 1;
  int ifix = modify->find_fix("package_intel");
  if (ifix < 0)
    error->all(FLERR,
               "The 'package intel' command is required for /intel styles");
  fix = static_cast<FixIntel *>(modify->fix[ifix]);
  fix->pair_init_check();
  #ifdef _LMP_INTEL_OFFLOAD
  _cop = fix->coprocessor_number();
  #endif
  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
    pack_force_const(force_const_single, fix->get_mixed_buffers());
  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    pack_force_const(force_const_double, fix->get_double_buffers());
  else
    pack_force_const(force_const_single, fix->get_single_buffers());
 }
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
  int off_ccache = 0;
  #ifdef _LMP_INTEL_OFFLOAD
  if (_cop >= 0) off_ccache = 1;
  #endif
  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
  _ccache_stride = buffers->ccache_stride();
  int tp1 = atom->ntypes + 1;
  fc.set_ntypes(tp1, memory, _cop);
  buffers->set_ntypes(tp1);
  flt_t **cutneighsq = buffers->get_cutneighsq();
  // Repeat cutsq calculation because done after call to init_style
  double cut, cutneigh;
  if (cut_lj > cut_coul)
    error->all(FLERR,
         "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = i; j <= atom->ntypes; j++) {
      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
        cut = init_one(i, j);
        cutneigh = cut + neighbor->skin;
        cutsq[i][j] = cutsq[j][i] = cut*cut;
        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
      }
    }
  }
  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
  cut_ljsq = cut_lj * cut_lj;
  cut_coulsq = cut_coul * cut_coul;
  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
  fc.cut_coulsq = cut_coulsq;
  fc.cut_ljsq = cut_ljsq;
  fc.cut_coul_innersq = cut_coul_innersq;
  fc.cut_lj_innersq = cut_lj_innersq;
  for (int i = 0; i < 4; i++) {
    fc.special_lj[i] = force->special_lj[i];
    fc.special_coul[i] = force->special_coul[i];
    fc.special_coul[0] = 1.0;
    fc.special_lj[0] = 1.0;
  }
  for (int i = 0; i < tp1; i++) {
    for (int j = 0; j < tp1; j++) {
      fc.lj[i][j].x = lj1[i][j];
      fc.lj[i][j].y = lj2[i][j];
      fc.lj[i][j].z = lj3[i][j];
      fc.lj[i][j].w = lj4[i][j];
      fc.cutsq[i][j] = cutsq[i][j];
    }
  }
  #ifdef _LMP_INTEL_OFFLOAD
  if (_cop < 0) return;
  flt_t * special_lj = fc.special_lj;
  flt_t * special_coul = fc.special_coul;
  flt_t * cutsq = fc.cutsq[0];
  LJ_T * lj = fc.lj[0];
  flt_t * ocutneighsq = cutneighsq[0];
  int tp1sq = tp1 * tp1;
  #pragma offload_transfer target(mic:_cop) \
    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
  #endif
 }
 /* ---------------------------------------------------------------------- */
 template <class flt_t>
 void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
  const int ntypes, Memory *memory, const int cop) {
  if (ntypes != _ntypes) {
    if (_ntypes > 0) {
      #ifdef _LMP_INTEL_OFFLOAD
      flt_t * ospecial_lj = special_lj;
      flt_t * ospecial_coul = special_coul;
      flt_t * ocutsq = cutsq[0];
      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
          ospecial_coul != NULL && cop >= 0) {
        #pragma offload_transfer target(mic:cop) \
          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
          nocopy(ocutsq, olj: alloc_if(0) free_if(1))
      }
      #endif
      _memory->destroy(cutsq);
      _memory->destroy(lj);
    }
    if (ntypes > 0) {
      _cop = cop;
      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
      memory->create(lj,ntypes,ntypes,"fc.lj");
      #ifdef _LMP_INTEL_OFFLOAD
      flt_t * ospecial_lj = special_lj;
      flt_t * ospecial_coul = special_coul;
      flt_t * ocutsq = cutsq[0];
      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
      int tp1sq = ntypes*ntypes;
      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
          ospecial_coul != NULL && cop >= 0) {
        #pragma offload_transfer target(mic:cop) \
          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
      }
      #endif
    }
  }
  _ntypes=ntypes;
  _memory=memory;
 }
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@ -0,0 +1,100 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
 #else
 #ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
 #define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
 #include "pair_lj_charmm_coul_charmm.h"
 #include "fix_intel.h"
 namespace LAMMPS_NS {
 class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
 public:
  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
  virtual ~PairLJCharmmCoulCharmmIntel();
  virtual void compute(int, int);
  void init_style();
  typedef struct { float x,y,z; int w; } sng4_t;
 private:
  FixIntel *fix;
  int _cop, _ccache_stride;
  template <class flt_t> class ForceConst;
  template <class flt_t, class acc_t>
  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
               const ForceConst<flt_t> &fc);
  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
  void eval(const int offload, const int vflag,
            IntelBuffers<flt_t,acc_t> * buffers,
            const ForceConst<flt_t> &fc, const int astart, const int aend);
  template <class flt_t, class acc_t>
  void pack_force_const(ForceConst<flt_t> &fc,
                        IntelBuffers<flt_t, acc_t> *buffers);
  // ----------------------------------------------------------------------
  template <class flt_t>
  class ForceConst {
   public:
    _alignvar(flt_t special_coul[4],64);
    _alignvar(flt_t special_lj[4],64);
    flt_t **cutsq;
    flt_t cut_coulsq, cut_ljsq;
    flt_t cut_coul_innersq, cut_lj_innersq;
    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
    ForceConst() : _ntypes(0) {}
    ~ForceConst() { set_ntypes(0,NULL,_cop); }
    void set_ntypes(const int ntypes, Memory *memory, const int cop);
   private:
    int _ntypes, _cop;
    Memory *_memory;
  };
  ForceConst<float> force_const_single;
  ForceConst<double> force_const_double;
 };
 }
 #endif
 #endif
 /* ERROR/WARNING messages:
 E: The 'package intel' command is required for /intel styles
 Self-explanatory.
 E: Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic
 The intel accelerated version of the CHARMM style requires that the
 Lennard-Jones cutoff is not greater than the coulombic cutoff.
 */
--- a/src/USER-INTEL/pair_rebo_intel.cpp
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@ -0,0 +1,42 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Markus Hohnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #include "pair_rebo_intel.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 PairREBOIntel::PairREBOIntel(LAMMPS *lmp) : PairAIREBOIntel(lmp) {}
 /* ----------------------------------------------------------------------
   global settings
 ------------------------------------------------------------------------- */
 void PairREBOIntel::settings(int narg, char **arg)
 {
  if (narg != 0) error->all(FLERR,"Illegal pair_style command");
  cutlj = 0.0;
  ljflag = torflag = 0;
  //
  // this one parameter for C-C interactions is different in REBO vs AIREBO
  // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016)
  PCCf_2_0 = 0.0;
 }
--- a/src/USER-INTEL/pair_rebo_intel.h
+++ b/src/USER-INTEL/pair_rebo_intel.h
@ -0,0 +1,40 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Markus Hohnerbach (RWTH)
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(rebo/intel,PairREBOIntel)
 #else
 #ifndef LMP_PAIR_REBO_INTEL_H
 #define LMP_PAIR_REBO_INTEL_H
 #include "pair_airebo_intel.h"
 namespace LAMMPS_NS {
 class PairREBOIntel : public PairAIREBOIntel {
 public:
  PairREBOIntel(class LAMMPS *);
  virtual void settings(int, char **);
 };
 }
 #endif
 #endif
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@ -345,16 +345,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
            if (jj < jnumhalf) ejnumhalf++;
          }
        }
        int ejnum_pad = ejnum;
-        while ( (ejnum_pad % pad_width) != 0) {
+	int ejrem = ejnum & (pad_width - 1);
-          tdelx[ejnum_pad] = (flt_t)0.0;
+	if (ejrem) ejrem = pad_width - ejrem;
-          tdely[ejnum_pad] = (flt_t)0.0;
+	const int ejnum_pad = ejnum + ejrem;
-          tdelz[ejnum_pad] = (flt_t)0.0;
+	for (int jj = ejnum; jj < ejnum_pad; jj++) {
-          trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
+          tdelx[jj] = (flt_t)0.0;
-          tj[ejnum_pad] = nall;
+          tdely[jj] = (flt_t)0.0;
-          if (!ONETYPE) tjtype[ejnum_pad] = 0;
+          tdelz[jj] = (flt_t)0.0;
-          ejnum_pad++;
+          trsq[jj] = p2[3].cutsq + (flt_t)1.0;
          tj[jj] = nall;
          if (!ONETYPE) tjtype[jj] = 0;
        }
        #if defined(LMP_SIMD_COMPILER)