From 777e82995de257844ca6d7101acdc78219d1c139 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Fri, 5 Sep 2014 14:42:44 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12406
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/OPT/pair_eam_opt.cpp                      |  38 +++---
 src/OPT/pair_lj_charmm_coul_long_opt.cpp      |  30 ++---
 src/OPT/pair_lj_cut_opt.cpp                   |  26 ++---
 src/OPT/pair_morse_opt.cpp                    |  26 ++---
 src/USER-INTEL/README                         |   8 +-
 src/USER-INTEL/fix_intel.h                    |  44 +++----
 src/USER-INTEL/intel_buffers.cpp              |   6 +-
 src/USER-INTEL/intel_buffers.h                |   7 +-
 src/USER-INTEL/intel_preprocess.h             |  35 ++++--
 src/USER-INTEL/math_extra_intel.h             |   2 +-
 src/USER-INTEL/neigh_half_bin_intel.cpp       | 109 +++++++++---------
 src/USER-INTEL/pair_gayberne_intel.cpp        |  59 ++++++----
 src/USER-INTEL/pair_gayberne_intel.h          |   5 +-
 .../pair_lj_charmm_coul_long_intel.cpp        |  42 +++----
 .../pair_lj_charmm_coul_long_intel.h          |   4 +-
 .../pair_lj_cut_coul_long_intel.cpp           |  42 +++----
 src/USER-INTEL/pair_lj_cut_coul_long_intel.h  |   4 +-
 src/USER-INTEL/pair_lj_cut_intel.cpp          |  28 ++---
 src/USER-INTEL/pair_lj_cut_intel.h            |   2 +-
 src/USER-INTEL/verlet_intel.cpp               |  98 ++++------------
 src/USER-INTEL/verlet_intel.h                 |   3 +-
 src/USER-INTEL/verlet_split_intel.cpp         |   6 +
 src/USER-MISC/pair_list.cpp                   |   5 -
 src/USER-OMP/domain_omp.cpp                   |   5 -
 src/USER-OMP/fix_nh_asphere_omp.cpp           |   5 -
 src/USER-OMP/fix_nh_omp.cpp                   |   5 -
 src/USER-OMP/fix_nh_sphere_omp.cpp            |   5 -
 src/USER-OMP/fix_nve_omp.cpp                  |   5 -
 src/USER-OMP/fix_nvt_sllod_omp.cpp            |   5 -
 src/USER-OMP/fix_rigid_nh_omp.cpp             |   5 -
 src/USER-OMP/fix_rigid_omp.cpp                |   5 -
 src/USER-OMP/fix_rigid_small_omp.cpp          |   5 -
 src/USER-OMP/thr_omp.h                        |  10 --
 src/lmptype.h                                 |  30 +++++
 src/lmpwindows.h                              |   1 -
 35 files changed, 340 insertions(+), 375 deletions(-)

diff --git a/src/OPT/pair_eam_opt.cpp b/src/OPT/pair_eam_opt.cpp
index ba0512df39..1f20db29a2 100644
--- a/src/OPT/pair_eam_opt.cpp
+++ b/src/OPT/pair_eam_opt.cpp
@@ -81,7 +81,7 @@ void PairEAMOpt::eval()
 
   int i,j,ii,jj,inum,jnum,itype,jtype;
   double evdwl = 0.0;
-  double* __restrict__ coeff;
+  double* _noalias coeff;
 
   // grow energy array if necessary
 
@@ -93,13 +93,13 @@ void PairEAMOpt::eval()
     fp = (double *) memory->smalloc(nmax*sizeof(double),"pair:fp");
   }
 
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
+  double** _noalias x = atom->x;
+  double** _noalias f = atom->f;
+  int* _noalias type = atom->type;
   int nlocal = atom->nlocal;
 
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  vec3_t* _noalias xx = (vec3_t*)x[0];
+  vec3_t* _noalias ff = (vec3_t*)f[0];
 
   double tmp_cutforcesq = cutforcesq;
   double tmp_rdr = rdr;
@@ -107,17 +107,17 @@ void PairEAMOpt::eval()
   int nr1 = nr-1;
 
   inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
+  int* _noalias ilist = list->ilist;
+  int** _noalias firstneigh = list->firstneigh;
+  int* _noalias numneigh = list->numneigh;
 
   int ntypes = atom->ntypes;
   int ntypes2 = ntypes*ntypes;
 
-  fast_alpha_t* __restrict__ fast_alpha =
+  fast_alpha_t* _noalias fast_alpha =
     (fast_alpha_t*) malloc(ntypes2*(nr+1)*sizeof(fast_alpha_t));
   for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_alpha_t* __restrict__ tab = &fast_alpha[i*ntypes*nr+j*nr];
+    fast_alpha_t* _noalias tab = &fast_alpha[i*ntypes*nr+j*nr];
     if (type2rhor[i+1][j+1] >= 0) {
       for(int m = 1; m <= nr; m++) {
         tab[m].rhor0i =  rhor_spline[type2rhor[i+1][j+1]][m][6];
@@ -135,12 +135,12 @@ void PairEAMOpt::eval()
       }
     }
   }
-  fast_alpha_t* __restrict__ tabeight = fast_alpha;
+  fast_alpha_t* _noalias tabeight = fast_alpha;
 
-  fast_gamma_t* __restrict__ fast_gamma =
+  fast_gamma_t* _noalias fast_gamma =
     (fast_gamma_t*) malloc(ntypes2*(nr+1)*sizeof(fast_gamma_t));
   for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_gamma_t* __restrict__ tab = &fast_gamma[i*ntypes*nr+j*nr];
+    fast_gamma_t* _noalias tab = &fast_gamma[i*ntypes*nr+j*nr];
     if (type2rhor[i+1][j+1] >= 0) {
       for(int m = 1; m <= nr; m++) {
         tab[m].rhor4i =  rhor_spline[type2rhor[i+1][j+1]][m][2];
@@ -168,7 +168,7 @@ void PairEAMOpt::eval()
       }
     }
   }
-  fast_gamma_t* __restrict__ tabss = fast_gamma;
+  fast_gamma_t* _noalias tabss = fast_gamma;
 
   // zero out density
 
@@ -188,11 +188,11 @@ void PairEAMOpt::eval()
     double ytmp = xx[i].y;
     double ztmp = xx[i].z;
     itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
+    int* _noalias jlist = firstneigh[i];
     jnum = numneigh[i];
 
     double tmprho = rho[i];
-    fast_alpha_t* __restrict__ tabeighti = &tabeight[itype*ntypes*nr];
+    fast_alpha_t* _noalias tabeighti = &tabeight[itype*ntypes*nr];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
@@ -265,14 +265,14 @@ void PairEAMOpt::eval()
     double ytmp = xx[i].y;
     double ztmp = xx[i].z;
     int itype1 = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
+    int* _noalias jlist = firstneigh[i];
     jnum = numneigh[i];
 
     double tmpfx = 0.0;
     double tmpfy = 0.0;
     double tmpfz = 0.0;
 
-    fast_gamma_t* __restrict__ tabssi = &tabss[itype1*ntypes*nr];
+    fast_gamma_t* _noalias tabssi = &tabss[itype1*ntypes*nr];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
diff --git a/src/OPT/pair_lj_charmm_coul_long_opt.cpp b/src/OPT/pair_lj_charmm_coul_long_opt.cpp
index 5979141e3c..136b9fc64d 100644
--- a/src/OPT/pair_lj_charmm_coul_long_opt.cpp
+++ b/src/OPT/pair_lj_charmm_coul_long_opt.cpp
@@ -84,22 +84,22 @@ void PairLJCharmmCoulLongOpt::eval()
   double evdwl = 0.0;
   double ecoul = 0.0;
 
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  double* __restrict__ q = atom->q;
-  int* __restrict__ type = atom->type;
+  double** _noalias x = atom->x;
+  double** _noalias f = atom->f;
+  double* _noalias q = atom->q;
+  int* _noalias type = atom->type;
   int nlocal = atom->nlocal;
-  double* __restrict__ special_coul = force->special_coul;
-  double* __restrict__ special_lj = force->special_lj;
+  double* _noalias special_coul = force->special_coul;
+  double* _noalias special_lj = force->special_lj;
   double qqrd2e = force->qqrd2e;
 
   inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
+  int* _noalias ilist = list->ilist;
+  int** _noalias firstneigh = list->firstneigh;
+  int* _noalias numneigh = list->numneigh;
 
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  vec3_t* _noalias xx = (vec3_t*)x[0];
+  vec3_t* _noalias ff = (vec3_t*)f[0];
 
   int ntypes = atom->ntypes;
   int ntypes2 = ntypes*ntypes;
@@ -107,7 +107,7 @@ void PairLJCharmmCoulLongOpt::eval()
   double tmp_coef1 = 1.0/denom_lj;
   double tmp_coef2 = cut_ljsq - 3.0*cut_lj_innersq;
 
-  fast_alpha_t* __restrict__ fast_alpha =
+  fast_alpha_t* _noalias fast_alpha =
     (fast_alpha_t*)malloc(ntypes2*sizeof(fast_alpha_t));
   for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
     fast_alpha_t& a = fast_alpha[i*ntypes+j];
@@ -117,7 +117,7 @@ void PairLJCharmmCoulLongOpt::eval()
     a.lj3 = lj3[i+1][j+1];
     a.lj4 = lj4[i+1][j+1];
   }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  fast_alpha_t* _noalias tabsix = fast_alpha;
 
   // loop over neighbors of my atoms
 
@@ -128,14 +128,14 @@ void PairLJCharmmCoulLongOpt::eval()
     double ytmp = xx[i].y;
     double ztmp = xx[i].z;
     itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
+    int* _noalias jlist = firstneigh[i];
     jnum = numneigh[i];
 
     double tmpfx = 0.0;
     double tmpfy = 0.0;
     double tmpfz = 0.0;
 
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
+    fast_alpha_t* _noalias tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
diff --git a/src/OPT/pair_lj_cut_opt.cpp b/src/OPT/pair_lj_cut_opt.cpp
index a3d90eae50..170a3aa728 100644
--- a/src/OPT/pair_lj_cut_opt.cpp
+++ b/src/OPT/pair_lj_cut_opt.cpp
@@ -67,24 +67,24 @@ void PairLJCutOpt::eval()
   double factor_lj;
   double evdwl = 0.0;
 
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
+  double** _noalias x = atom->x;
+  double** _noalias f = atom->f;
+  int* _noalias type = atom->type;
   int nlocal = atom->nlocal;
-  double* __restrict__ special_lj = force->special_lj;
+  double* _noalias special_lj = force->special_lj;
 
   inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
+  int* _noalias ilist = list->ilist;
+  int** _noalias firstneigh = list->firstneigh;
+  int* _noalias numneigh = list->numneigh;
 
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  vec3_t* _noalias xx = (vec3_t*)x[0];
+  vec3_t* _noalias ff = (vec3_t*)f[0];
 
   int ntypes = atom->ntypes;
   int ntypes2 = ntypes*ntypes;
 
-  fast_alpha_t* __restrict__ fast_alpha =
+  fast_alpha_t* _noalias fast_alpha =
     (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
   for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
     fast_alpha_t& a = fast_alpha[i*ntypes+j];
@@ -95,7 +95,7 @@ void PairLJCutOpt::eval()
     a.lj4 = lj4[i+1][j+1];
     a.offset = offset[i+1][j+1];
   }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  fast_alpha_t* _noalias tabsix = fast_alpha;
 
   // loop over neighbors of my atoms
 
@@ -105,14 +105,14 @@ void PairLJCutOpt::eval()
     double ytmp = xx[i].y;
     double ztmp = xx[i].z;
     itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
+    int* _noalias jlist = firstneigh[i];
     jnum = numneigh[i];
 
     double tmpfx = 0.0;
     double tmpfy = 0.0;
     double tmpfz = 0.0;
 
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
+    fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
diff --git a/src/OPT/pair_morse_opt.cpp b/src/OPT/pair_morse_opt.cpp
index c27dd053ff..6fcb9843a6 100644
--- a/src/OPT/pair_morse_opt.cpp
+++ b/src/OPT/pair_morse_opt.cpp
@@ -68,24 +68,24 @@ void PairMorseOpt::eval()
   double factor_lj;
   double evdwl = 0.0;
 
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
+  double** _noalias x = atom->x;
+  double** _noalias f = atom->f;
+  int* _noalias type = atom->type;
   int nlocal = atom->nlocal;
-  double* __restrict__ special_lj = force->special_lj;
+  double* _noalias special_lj = force->special_lj;
 
   inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
+  int* _noalias ilist = list->ilist;
+  int** _noalias firstneigh = list->firstneigh;
+  int* _noalias numneigh = list->numneigh;
 
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  vec3_t* _noalias xx = (vec3_t*)x[0];
+  vec3_t* _noalias ff = (vec3_t*)f[0];
 
   int ntypes = atom->ntypes;
   int ntypes2 = ntypes*ntypes;
 
-  fast_alpha_t* __restrict__ fast_alpha =
+  fast_alpha_t* _noalias fast_alpha =
     (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
   for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
     fast_alpha_t& a = fast_alpha[i*ntypes+j];
@@ -96,7 +96,7 @@ void PairMorseOpt::eval()
     a.d0 = d0[i+1][j+1];
     a.offset = offset[i+1][j+1];
   }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  fast_alpha_t* _noalias tabsix = fast_alpha;
 
   // loop over neighbors of my atoms
 
@@ -106,14 +106,14 @@ void PairMorseOpt::eval()
     double ytmp = xx[i].y;
     double ztmp = xx[i].z;
     itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
+    int* _noalias jlist = firstneigh[i];
     jnum = numneigh[i];
 
     double tmpfx = 0.0;
     double tmpfy = 0.0;
     double tmpfz = 0.0;
 
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
+    fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index 27c60d237a..930cacdd38 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -38,8 +38,6 @@ Intel compilers.
 
 -----------------------------------------------------------------------------
 
-The files in this package must be compiled with the Intel C++
-compiler, i.e. icc/icpc.
-
-
-
+For portability reasons, vectorization directives are currently only enabled 
+for Intel compilers. Using other compilers may result in significantly
+lower performance.
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index 82ebc734a2..5b7d2b3926 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -128,7 +128,7 @@ class FixIntel : public Fix {
 
  protected:
   int _overflow_flag[5];
-  __declspec(align(64)) int _off_overflow_flag[5];
+  _alignvar(int _off_overflow_flag[5],64);
   int _allow_separate_buffers, _offload_ghost;
   #ifdef _LMP_INTEL_OFFLOAD
   double _balance_pair_time, _balance_other_time;
@@ -155,18 +155,18 @@ class FixIntel : public Fix {
   double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
   double _timers[NUM_ITIMERS];
   double _stopwatch[NUM_ITIMERS];
-  __declspec(align(64)) double _stopwatch_offload_neighbor[1];
-  __declspec(align(64)) double _stopwatch_offload_pair[1];
+  _alignvar(double _stopwatch_offload_neighbor[1],64);
+  _alignvar(double _stopwatch_offload_pair[1],64);
 
   template <class ft, class acc_t>
-  inline void add_results(const ft * restrict const f_in,
-                          const acc_t * restrict const ev_global,
+  inline void add_results(const ft * _noalias const f_in,
+                          const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
 			  const int offload);
 
   template <class ft, class acc_t>
-  inline void add_oresults(const ft * restrict const f_in,
-			   const acc_t * restrict const ev_global,
+  inline void add_oresults(const ft * _noalias const f_in,
+			   const acc_t * _noalias const ev_global,
 			   const int eatom, const int vatom,
 			   const int out_offset, const int nall);
 
@@ -176,8 +176,8 @@ class FixIntel : public Fix {
   int _im_real_space_task;
   MPI_Comm _real_space_comm;
   template <class ft, class acc_t>
-  inline void add_off_results(const ft * restrict const f_in,
-                              const acc_t * restrict const ev_global);
+  inline void add_off_results(const ft * _noalias const f_in,
+                              const acc_t * _noalias const ev_global);
   #endif
 };
 
@@ -284,8 +284,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
 /* ---------------------------------------------------------------------- */
 
 template <class ft, class acc_t>
-void FixIntel::add_results(const ft * restrict const f_in,
-                           const acc_t * restrict const ev_global,
+void FixIntel::add_results(const ft * _noalias const f_in,
+                           const acc_t * _noalias const ev_global,
                            const int eatom, const int vatom,
 			   const int offload) {
   start_watch(TIME_PACK);
@@ -295,7 +295,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
     if (offload) {
       add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
       if (force->newton_pair) {
-	const acc_t * restrict const enull = 0;
+	const acc_t * _noalias const enull = 0;
 	int offset = _offload_nlocal;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom, vatom, 
@@ -305,7 +305,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
       add_oresults(f_in, ev_global, eatom, vatom,
 		   _host_min_local, _host_used_local);
       if (force->newton_pair) {
-	const acc_t * restrict const enull = 0;
+	const acc_t * _noalias const enull = 0;
 	int offset = _host_used_local;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom, 
@@ -333,11 +333,11 @@ void FixIntel::add_results(const ft * restrict const f_in,
 /* ---------------------------------------------------------------------- */
 
 template <class ft, class acc_t>
-void FixIntel::add_oresults(const ft * restrict const f_in,
-			    const acc_t * restrict const ev_global,
+void FixIntel::add_oresults(const ft * _noalias const f_in,
+			    const acc_t * _noalias const ev_global,
 			    const int eatom, const int vatom,
 			    const int out_offset, const int nall) {
-  lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
+  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
   if (atom->torque) {
     if (f_in[1].w)
       if (f_in[1].w == 1)
@@ -351,12 +351,16 @@ void FixIntel::add_oresults(const ft * restrict const f_in,
   #pragma omp parallel default(none)
   #endif
   {
+    #if defined(_OPENMP)
     const int tid = omp_get_thread_num();
+    #else
+    const int tid = 0;
+    #endif
     int ifrom, ito;
     IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
     if (atom->torque) {
       int ii = ifrom * 2;
-      lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] +
+      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
 	out_offset;
       if (eatom) {
         for (int i = ifrom; i < ito; i++) {
@@ -440,6 +444,7 @@ void FixIntel::balance_stamp() {
 /* ---------------------------------------------------------------------- */
 
 void FixIntel::acc_timers() {
+  _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
   if (neighbor->ago == 0) {
     _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
     if (_setup_time_cleared == false) {
@@ -447,7 +452,6 @@ void FixIntel::acc_timers() {
       _setup_time_cleared = true;
     }
   }
-  _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -464,8 +468,8 @@ void FixIntel::set_neighbor_host_sizes() {
 /* ---------------------------------------------------------------------- */
 
 template <class ft, class acc_t>
-void FixIntel::add_off_results(const ft * restrict const f_in,
-                               const acc_t * restrict const ev_global) {
+void FixIntel::add_off_results(const ft * _noalias const f_in,
+                               const acc_t * _noalias const ev_global) {
   if (_offload_balance < 0.0)
     _balance_other_time = MPI_Wtime() - _balance_other_time;
 
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index a541f0f359..d88d4cb377 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -22,8 +22,8 @@ using namespace LAMMPS_NS;
 
 template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
-    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _buf_size(0),
-    _buf_local_size(0), _off_threads(0) {
+    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
+    _buf_size(0), _buf_local_size(0) {
   _list_alloc_atoms = 0;
   _ntypes = 0;
   _off_map_maxlocal = 0;
@@ -423,6 +423,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
   tmem += _off_map_maxlocal * sizeof(int);
   tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
   tmem += _ntypes * _ntypes * sizeof(int);
+
+  return tmem;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index bc1ca9e3b8..4b60c87f04 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -235,7 +235,8 @@ class IntelBuffers {
 
   double memory_usage(const int nthreads);
 
-  int _special_holder, _nspecial_holder;
+  tagint _special_holder;
+  int _nspecial_holder;
 
  protected:
   LAMMPS *lmp;
@@ -266,8 +267,8 @@ class IntelBuffers {
   #endif
   
   int _buf_size, _buf_local_size;
-  __declspec(align(64)) acc_t _ev_global[8];
-  __declspec(align(64)) acc_t _ev_global_host[8];
+  _alignvar(acc_t _ev_global[8],64);
+  _alignvar(acc_t _ev_global_host[8],64);
 
   void _grow(const int nall, const int nlocal, const int nthreads,
 	     const int offload_end);
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 49e3413e0a..da68ecc934 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -26,6 +26,22 @@
 
 #ifndef LAMMPS_MEMALIGN
 #error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
+#else
+#if (LAMMPS_MEMALIGN != 64)
+#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
+#endif
+#endif
+
+#if defined(_OPENMP)
+#define _use_omp_pragma(txt) _Pragma(txt)
+#else
+#define _use_omp_pragma(txt)
+#endif
+
+#if defined(__INTEL_COMPILER)
+#define _use_simd_pragma(txt) _Pragma(txt)
+#else
+#define _use_simd_pragma(txt)
 #endif
 
 namespace LAMMPS_NS {
@@ -141,7 +157,7 @@ inline double MIC_Wtime() {
     if (fix->separate_buffers() && ago != 0) {				\
     fix->start_watch(TIME_PACK);					\
     if (offload) {							\
-      _Pragma("omp parallel default(none) shared(buffers,nlocal,nall)")	\
+      _use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)")	\
       {									\
         int ifrom, ito, tid;						\
 	int nthreads = comm->nthreads;					\
@@ -343,15 +359,16 @@ inline double MIC_Wtime() {
   else									\
     o_range = nlocal;							\
   if (offload == 0) o_range -= minlocal;				\
-    IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,		\
+    IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,	\
 			 sizeof(acc_t));				\
 									\
   int t_off = f_stride;						        \
   if (eflag && eatom) {							\
     for (int t = 1; t < nthreads; t++) {				\
-      _Pragma("vector nontemporal")					\
+      _use_simd_pragma("vector nontemporal")				\
+      _use_simd_pragma("novector")					\
       for (int n = iifrom; n < iito; n++) {				\
-        f_start[n].x += f_start[n + t_off].x;				\ 
+        f_start[n].x += f_start[n + t_off].x;				\
         f_start[n].y += f_start[n + t_off].y;				\
 	f_start[n].z += f_start[n + t_off].z;				\
 	f_start[n].w += f_start[n + t_off].w;				\
@@ -360,8 +377,9 @@ inline double MIC_Wtime() {
     }									\
   } else {								\
     for (int t = 1; t < nthreads; t++) {				\
-      _Pragma("vector nontemporal")   					\
-      for (int n = iifrom; n < iito; n++) {                             \ 
+      _use_simd_pragma("vector nontemporal")  				\
+      _use_simd_pragma("novector")					\
+      for (int n = iifrom; n < iito; n++) {                             \
 	f_start[n].x += f_start[n + t_off].x;                  	        \
         f_start[n].y += f_start[n + t_off].y;				\
         f_start[n].z += f_start[n + t_off].z;				\
@@ -372,8 +390,9 @@ inline double MIC_Wtime() {
 									\
   if (evflag) {								\
     if (vflag == 2) {							\
-      const ATOM_T * restrict const xo = x + minlocal;			\
-      _Pragma("vector nontemporal")   					\
+      const ATOM_T * _noalias const xo = x + minlocal;			\
+      _use_simd_pragma("vector nontemporal")   				\
+      _use_simd_pragma("novector")					\
       for (int n = iifrom; n < iito; n++) {				\
 	ov0 += f_start[n].x * xo[n].x;					\
 	ov1 += f_start[n].y * xo[n].y;					\
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
index 62163b3f60..780d75b79b 100644
--- a/src/USER-INTEL/math_extra_intel.h
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -287,7 +287,7 @@
   if (fabs(aug_8) > fabs(aug_0)) {			\
     flt_t swapt;					\
     swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
-    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;        \ 
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;        \
     swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;      \
     swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;      \
   }							\
diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp
index a5f12a56f9..162a73c83f 100644
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@@ -79,7 +79,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
   const int n1 = nspecial[i * 3];                                     \
   const int n2 = nspecial[i * 3 + 1];                                 \
   const int n3 = nspecial[i * 3 + 2];                                 \
-  const int *sptr = special + i * maxspecial;                         \
+  const tagint *sptr = special + i * maxspecial;                      \
   for (int s = 0; s < n3; s++) {                                      \
     if (sptr[s] == tag) {                                             \
       if (s < n1) {                                                   \
@@ -105,7 +105,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
 
 template <class flt_t, class acc_t>
 void Neighbor::bin_atoms(void * xin) {
-  const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin;
+  const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin;
   int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
@@ -243,11 +243,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
     return;
   }
 
-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const int molecular = atom->molecular;
-  int *ns = NULL, *s = NULL;
+  int *ns = NULL;
+  tagint *s = NULL;
   int tag_size, special_size;
   if (molecular) {
     s = atom->special[0];
@@ -260,23 +261,23 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
     tag_size = 0;
     special_size = 0;
   }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
   const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const tagint * _noalias const tag = atom->tag;
 
-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
   const int ntypes = atom->ntypes + 1;
   const int nlocal = atom->nlocal;
 
   #ifndef _LMP_INTEL_OFFLOAD
   int * const mask = atom->mask;
-  int * const molecule = atom->molecule;
+  tagint * const molecule = atom->molecule;
   #endif
 
   int tnum;
@@ -316,8 +317,8 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
   const int nbinx = this->nbinx;
   const int nbiny = this->nbiny;
   const int nbinz = this->nbinz;
@@ -327,7 +328,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
   const int mbinx = this->mbinx;
   const int mbiny = this->mbiny;
   const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
   const int cop = fix->coprocessor_number();
   const int separate_buffers = fix->separate_buffers();
   #pragma offload target(mic:cop) if(offload) \
@@ -486,7 +487,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
 
       if (molecular) {
         for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
           for (int jj = 0; jj < jnum; jj++) {
             const int j = jlist[jj];
@@ -507,7 +508,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
       #ifdef _LMP_INTEL_OFFLOAD
       else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
@@ -662,14 +663,15 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
     return;
   }
 
-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
   int nall_t = nall;
   if (offload_noghost && offload) nall_t = atom->nlocal;
   const int e_nall = nall_t;
 
   const int molecular = atom->molecular;
-  int *ns = NULL, *s = NULL;
+  int *ns = NULL;
+  tagint *s = NULL;
   int tag_size, special_size;
   if (molecular) {
     s = atom->special[0];
@@ -682,23 +684,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
     tag_size = 0;
     special_size = 0;
   }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
   const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const tagint * _noalias const tag = atom->tag;
 
-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
   const int ntypes = atom->ntypes + 1;
   const int nlocal = atom->nlocal;
 
   #ifndef _LMP_INTEL_OFFLOAD
   int * const mask = atom->mask;
-  int * const molecule = atom->molecule;
+  tagint * const molecule = atom->molecule;
   #endif
 
   int tnum;
@@ -737,8 +739,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
   const int nbinx = this->nbinx;
   const int nbiny = this->nbiny;
   const int nbinz = this->nbinz;
@@ -748,7 +750,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
   const int mbinx = this->mbinx;
   const int mbiny = this->mbiny;
   const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
   const int cop = fix->coprocessor_number();
   const int separate_buffers = fix->separate_buffers();
   #pragma offload target(mic:cop) if(offload) \
@@ -948,7 +950,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 
       if (molecular) {
         for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
           for (int jj = 0; jj < jnum; jj++) {
             const int j = jlist[jj];
@@ -970,7 +972,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
       #ifdef _LMP_INTEL_OFFLOAD
       else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
@@ -1127,14 +1129,15 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
     return;
   }
 
-  const ATOM_T * restrict const x = buffers->get_x();
-  int * restrict const firstneigh = buffers->firstneigh(list);
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
   int nall_t = nall;
   if (offload_noghost && offload) nall_t = atom->nlocal;
   const int e_nall = nall_t;
 
   const int molecular = atom->molecular;
-  int *ns = NULL, *s = NULL;
+  int *ns = NULL;
+  tagint *s = NULL;
   int tag_size, special_size;
   if (molecular) {
     s = atom->special[0];
@@ -1147,23 +1150,23 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
     tag_size = 0;
     special_size = 0;
   }
-  const int * restrict const special = s;
-  const int * restrict const nspecial = ns;
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
   const int maxspecial = atom->maxspecial;
-  const int * restrict const tag = atom->tag;
+  const tagint * _noalias const tag = atom->tag;
 
-  int * restrict const ilist = list->ilist;
-  int * restrict numneigh = list->numneigh;
-  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int nstencil = list->nstencil;
-  const int * restrict const stencil = list->stencil;
-  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
   const int ntypes = atom->ntypes + 1;
   const int nlocal = atom->nlocal;
 
   #ifndef _LMP_INTEL_OFFLOAD
   int * const mask = atom->mask;
-  int * const molecule = atom->molecule;
+  tagint * const molecule = atom->molecule;
   #endif
 
   int tnum;
@@ -1202,8 +1205,8 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
-  const int * restrict const binhead = this->binhead;
-  const int * restrict const special_flag = this->special_flag;
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
   const int nbinx = this->nbinx;
   const int nbiny = this->nbiny;
   const int nbinz = this->nbinz;
@@ -1213,7 +1216,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
   const int mbinx = this->mbinx;
   const int mbiny = this->mbiny;
   const int mbinz = this->mbinz;
-  const int * restrict const bins = this->bins;
+  const int * _noalias const bins = this->bins;
   const int cop = fix->coprocessor_number();
   const int separate_buffers = fix->separate_buffers();
   #pragma offload target(mic:cop) if(offload) \
@@ -1386,7 +1389,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
 
       if (molecular) {
         for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
           for (int jj = 0; jj < jnum; jj++) {
             const int j = jlist[jj];
@@ -1407,7 +1410,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
       #ifdef _LMP_INTEL_OFFLOAD
       else if (separate_buffers) {
 	for (int i = ifrom; i < ito; ++i) {
-          int * restrict jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
 	  int jj = 0;
 	  for (jj = 0; jj < jnum; jj++)
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index 46e608c92f..aae42a7145 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -79,7 +79,7 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     fix->start_watch(TIME_PACK);
     const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
     const int * const ellipsoid = atom->ellipsoid;
-    QUAT_T * restrict const quat = buffers->get_quat();
+    QUAT_T * _noalias const quat = buffers->get_quat();
     #if defined(_OPENMP)
     #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
     #endif
@@ -150,8 +150,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
-  ATOM_T * restrict const x = buffers->get_x(offload);
-  QUAT_T * restrict const quat = buffers->get_quat(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  QUAT_T * _noalias const quat = buffers->get_quat(offload);
   const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   const int *ellipsoid = atom->ellipsoid;
 
@@ -225,15 +225,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   }									
   #endif
 
-  //  const int * restrict const ilist = list->ilist;
-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
-  const flt_t * restrict const special_lj = fc.special_lj;
+  //  const int * _noalias const ilist = list->ilist;
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const flt_t * _noalias const special_lj = fc.special_lj;
 
-  const FC_PACKED1_T * restrict const ijc = fc.ijc[0];
-  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
-  const FC_PACKED3_T * restrict const ic = fc.ic;
+  const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
+  const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
+  const FC_PACKED3_T * _noalias const ic = fc.ic;
   const flt_t mu = fc.mu;
   const flt_t gamma = fc.gamma;
   const flt_t upsilon = fc.upsilon;
@@ -255,8 +255,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);
 
   int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int max_nbors = _max_nbors;
   const int nthreads = tc;
@@ -351,25 +351,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
       memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
 
-      flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors;
-      flt_t * restrict const delx_form = delx_formi + tid * max_nbors;
-      flt_t * restrict const dely_form = dely_formi + tid * max_nbors;
-      flt_t * restrict const delz_form = delz_formi + tid * max_nbors;
-      int * restrict const jtype_form = jtype_formi + tid * max_nbors;
-      int * restrict const jlist_form = jlist_formi + tid * max_nbors;
+      flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
+      flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
+      flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
+      flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
+      int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
+      int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
 
       int ierror = 0;
       for (int i = iifrom; i < iito; ++i) {
         // const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * restrict const ijci = ijc + ptr_off;
-        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+        const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
+        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
-        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         const flt_t xtmp = x[i].x;
@@ -433,9 +433,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 	__assume(packed_j % 8 == 0);
 	__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
 	#endif
+        #if defined(__INTEL_COMPILER)
         #pragma vector aligned
 	#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
 	                         sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+        #endif
         for (int jj = 0; jj < packed_j; jj++) {
           flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
           flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
@@ -796,7 +798,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       int t_off = f_stride;
       if (EFLAG && eatom) {
         for (int t = 1; t < nthreads; t++) {
+          #if defined(__INTEL_COMPILER)
 	  #pragma vector nontemporal
+	  #pragma novector
+          #endif
           for (int n = iifrom * 2; n < two_iito; n++) {
             f_start[n].x += f_start[n + t_off].x;
             f_start[n].y += f_start[n + t_off].y;
@@ -807,7 +812,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         }
       } else {
         for (int t = 1; t < nthreads; t++) {
+          #if defined(__INTEL_COMPILER)
 	  #pragma vector nontemporal
+	  #pragma novector
+          #endif
           for (int n = iifrom * 2; n < two_iito; n++) {
             f_start[n].x += f_start[n + t_off].x;
             f_start[n].y += f_start[n + t_off].y;
@@ -819,8 +827,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
       if (EVFLAG) {
         if (vflag==2) {
-          const ATOM_T * restrict const xo = x + minlocal;
+          const ATOM_T * _noalias const xo = x + minlocal;
+          #if defined(__INTEL_COMPILER)
 	  #pragma vector nontemporal
+	  #pragma novector
+          #endif
           for (int n = iifrom; n < iito; n++) {
             const int nt2 = n * 2;
             ov0 += f_start[nt2].x * xo[n].x;
diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h
index eb055e151e..9a4aae6c72 100644
--- a/src/USER-INTEL/pair_gayberne_intel.h
+++ b/src/USER-INTEL/pair_gayberne_intel.h
@@ -62,7 +62,10 @@ class PairGayBerneIntel : public PairGayBerne {
     typedef struct { flt_t lj3, lj4; } fc_packed2;
     typedef struct { flt_t shape2[4], well[4]; } fc_packed3;
 
-    __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
+    _alignvar(flt_t special_lj[4],64);
+    _alignvar(flt_t gamma,64);
+    _alignvar(flt_t upsilon,64);
+    _alignvar(flt_t mu,64);
     fc_packed1 **ijc;
     fc_packed2 **lj34;
     fc_packed3 *ic;
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index 576d5b21c7..8d23e8f589 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -143,25 +143,25 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
-  ATOM_T * restrict const x = buffers->get_x(offload);
-  flt_t * restrict const q = buffers->get_q(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
 
-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
 
-  const flt_t * restrict const special_coul = fc.special_coul;
-  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
   const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
 
-  const flt_t * restrict const cutsq = fc.cutsq[0];
-  const LJ_T * restrict const lj = fc.lj[0];
-  const TABLE_T * restrict const table = fc.table;
-  const flt_t * restrict const etable = fc.etable;
-  const flt_t * restrict const detable = fc.detable;
-  const flt_t * restrict const ctable = fc.ctable;
-  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t * _noalias const cutsq = fc.cutsq[0];
+  const LJ_T * _noalias const lj = fc.lj[0];
+  const TABLE_T * _noalias const table = fc.table;
+  const flt_t * _noalias const etable = fc.etable;
+  const flt_t * _noalias const detable = fc.detable;
+  const flt_t * _noalias const ctable = fc.ctable;
+  const flt_t * _noalias const dctable = fc.dctable;
   const flt_t cut_ljsq = fc.cut_ljsq;
   const flt_t cut_lj_innersq = fc.cut_lj_innersq;
   const flt_t cut_coulsq = fc.cut_coulsq;
@@ -178,8 +178,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);
 		       
   int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
@@ -242,7 +242,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
       flt_t cutboth = cut_coulsq;
 
@@ -251,10 +251,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
-        const flt_t * restrict const cutsqi = cutsq + ptr_off;
-        const LJ_T * restrict const lji = lj + ptr_off;
+        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
+        const LJ_T * _noalias const lji = lj + ptr_off;
 
-        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -270,9 +270,11 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 	}
 
+        #if defined(__INTEL_COMPILER)
 	#pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
index ad66c786b6..dbdd6982ab 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -62,8 +62,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
   class ForceConst {
    public:
     typedef struct { flt_t r, dr, f, df; } table_t;
-    __declspec(align(64)) flt_t special_coul[4];
-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
     flt_t **cutsq, g_ewald, tabinnersq;
     flt_t cut_coulsq, cut_ljsq;
     flt_t cut_lj_innersq;
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index 4163a1f7d2..582ad7eb85 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -143,24 +143,24 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
-  ATOM_T * restrict const x = buffers->get_x(offload);
-  flt_t * restrict const q = buffers->get_q(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
 
-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
 
-  const flt_t * restrict const special_coul = fc.special_coul;
-  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
-  const C_FORCE_T * restrict const c_force = fc.c_force[0];
-  const C_ENERGY_T * restrict const c_energy = fc.c_energy[0];
-  const TABLE_T * restrict const table = fc.table;
-  const flt_t * restrict const etable = fc.etable;
-  const flt_t * restrict const detable = fc.detable;
-  const flt_t * restrict const ctable = fc.ctable;
-  const flt_t * restrict const dctable = fc.dctable;
+  const C_FORCE_T * _noalias const c_force = fc.c_force[0];
+  const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
+  const TABLE_T * _noalias const table = fc.table;
+  const flt_t * _noalias const etable = fc.etable;
+  const flt_t * _noalias const detable = fc.detable;
+  const flt_t * _noalias const ctable = fc.ctable;
+  const flt_t * _noalias const dctable = fc.dctable;
   const flt_t g_ewald = fc.g_ewald;
   const flt_t tabinnersq = fc.tabinnersq;
 
@@ -174,8 +174,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);
 
   int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
@@ -237,17 +237,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; ++i) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
-        const C_FORCE_T * restrict const c_forcei = c_force + ptr_off;
-        const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off;
+        const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
+        const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
 
-        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -263,9 +263,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 	}
 
+        #if defined(__INTEL_COMPILER)
 	#pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
index d7b4282a99..641c35660c 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@@ -64,8 +64,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
     typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t;
     typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t;
     typedef struct { flt_t r, dr, f, df; } table_t;
-    __declspec(align(64)) flt_t special_coul[4];
-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
     flt_t g_ewald, tabinnersq;
     c_force_t **c_force;
     c_energy_t **c_energy;
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index bca3a73493..897abdde53 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -134,14 +134,14 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
-  ATOM_T * restrict const x = buffers->get_x(offload);
+  ATOM_T * _noalias const x = buffers->get_x(offload);
 
-  const int * restrict const numneigh = list->numneigh;
-  const int * restrict const cnumneigh = buffers->cnumneigh(list);
-  const int * restrict const firstneigh = buffers->firstneigh(list);
-  const flt_t * restrict const special_lj = fc.special_lj;
-  const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0];
-  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
+  const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
@@ -153,8 +153,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 		       x_size, q_size, ev_size, f_stride);
 
   int tc;
-  FORCE_T * restrict f_start;
-  acc_t * restrict ev_global;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
   int *overflow = fix->get_off_overflow_flag();
@@ -184,17 +184,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; ++i) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off;
-        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+        const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
+        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
-        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
@@ -209,9 +209,11 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
         }
 
+        #if defined(__INTEL_COMPILER)
         #pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
           forcelj = evdwl = (flt_t)0.0;
diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h
index a40e39af56..13737affab 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@@ -62,7 +62,7 @@ class PairLJCutIntel : public PairLJCut {
     typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1;
     typedef struct { flt_t lj3, lj4; } fc_packed2;
 
-    __declspec(align(64)) flt_t special_lj[4];
+    _alignvar(flt_t special_lj[4],64);
     fc_packed1 **ljc12o;
     fc_packed2 **lj34;
 
diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp
index 64177e0f05..5bfd04639c 100644
--- a/src/USER-INTEL/verlet_intel.cpp
+++ b/src/USER-INTEL/verlet_intel.cpp
@@ -17,6 +17,7 @@
 #include "domain.h"
 #include "comm.h"
 #include "atom.h"
+#include "atom_vec.h"
 #include "force.h"
 #include "pair.h"
 #include "bond.h"
@@ -81,14 +82,9 @@ void VerletIntel::init()
   // set flags for what arrays to clear in force_clear()
   // need to clear additionals arrays if they exist
 
-  torqueflag = 0;
+  torqueflag = extraflag = 0;
   if (atom->torque_flag) torqueflag = 1;
-  erforceflag = 0;
-  if (atom->erforce_flag) erforceflag = 1;
-  e_flag = 0;
-  if (atom->e_flag) e_flag = 1;
-  rho_flag = 0;
-  if (atom->rho_flag) rho_flag = 1;
+  if (atom->avec->forceclearflag) extraflag = 1;
 
   // orthogonal vs triclinic simulation box
 
@@ -388,7 +384,7 @@ void VerletIntel::cleanup()
 
 void VerletIntel::force_clear()
 {
-  int i;
+  size_t nbytes;
 
   if (external_force_clear) return;
 
@@ -396,19 +392,16 @@ void VerletIntel::force_clear()
   // if either newton flag is set, also include ghosts
   // when using threads always clear all forces.
 
-  if (neighbor->includegroup == 0) {
-    int nall;
-    if (force->newton) nall = atom->nlocal + atom->nghost;
-    else nall = atom->nlocal;
+  int nlocal = atom->nlocal;
 
-    size_t nbytes = sizeof(double) * nall;
+  if (neighbor->includegroup == 0) {
+    nbytes = sizeof(double) * nlocal;
+    if (force->newton) nbytes += sizeof(double) * atom->nghost;
 
     if (nbytes) {
-      memset(&(atom->f[0][0]),0,3*nbytes);
-      if (torqueflag)  memset(&(atom->torque[0][0]),0,3*nbytes);
-      if (erforceflag) memset(&(atom->erforce[0]),  0,  nbytes);
-      if (e_flag)      memset(&(atom->de[0]),       0,  nbytes);
-      if (rho_flag)    memset(&(atom->drho[0]),     0,  nbytes);
+      memset(&atom->f[0][0],0,3*nbytes);
+      if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
+      if (extraflag) atom->avec->force_clear(0,nbytes);
     }
 
   // neighbor includegroup flag is set
@@ -416,70 +409,21 @@ void VerletIntel::force_clear()
   // if either newton flag is set, also include ghosts
 
   } else {
-    int nall = atom->nfirst;
+    nbytes = sizeof(double) * atom->nfirst;
 
-    double **f = atom->f;
-    for (i = 0; i < nall; i++) {
-      f[i][0] = 0.0;
-      f[i][1] = 0.0;
-      f[i][2] = 0.0;
-    }
-
-    if (torqueflag) {
-      double **torque = atom->torque;
-      for (i = 0; i < nall; i++) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-        torque[i][2] = 0.0;
-      }
-    }
-
-    if (erforceflag) {
-      double *erforce = atom->erforce;
-      for (i = 0; i < nall; i++) erforce[i] = 0.0;
-    }
-
-    if (e_flag) {
-      double *de = atom->de;
-      for (i = 0; i < nall; i++) de[i] = 0.0;
-    }
-
-    if (rho_flag) {
-      double *drho = atom->drho;
-      for (i = 0; i < nall; i++) drho[i] = 0.0;
+    if (nbytes) {
+      memset(&atom->f[0][0],0,3*nbytes);
+      if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
+      if (extraflag) atom->avec->force_clear(0,nbytes);
     }
 
     if (force->newton) {
-      nall = atom->nlocal + atom->nghost;
+      nbytes = sizeof(double) * atom->nghost;
 
-      for (i = atom->nlocal; i < nall; i++) {
-        f[i][0] = 0.0;
-        f[i][1] = 0.0;
-        f[i][2] = 0.0;
-      }
-
-      if (torqueflag) {
-        double **torque = atom->torque;
-        for (i = atom->nlocal; i < nall; i++) {
-          torque[i][0] = 0.0;
-          torque[i][1] = 0.0;
-          torque[i][2] = 0.0;
-        }
-      }
-
-      if (erforceflag) {
-        double *erforce = atom->erforce;
-        for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0;
-      }
-
-      if (e_flag) {
-        double *de = atom->de;
-        for (i = 0; i < nall; i++) de[i] = 0.0;
-      }
-
-      if (rho_flag) {
-        double *drho = atom->drho;
-        for (i = 0; i < nall; i++) drho[i] = 0.0;
+      if (nbytes) {
+        memset(&atom->f[nlocal][0],0,3*nbytes);
+        if (torqueflag) memset(&atom->torque[nlocal][0],0,3*nbytes);
+        if (extraflag) atom->avec->force_clear(nlocal,nbytes);
       }
     }
   }
diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h
index de4231431d..166dc137d0 100644
--- a/src/USER-INTEL/verlet_intel.h
+++ b/src/USER-INTEL/verlet_intel.h
@@ -39,8 +39,7 @@ class VerletIntel : public Integrate {
 
  protected:
   int triclinic;                    // 0 if domain is orthog, 1 if triclinic
-  int torqueflag,erforceflag;
-  int e_flag,rho_flag;
+  int torqueflag,extraflag;
 
   virtual void force_clear();
   #ifdef _LMP_INTEL_OFFLOAD
diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp
index 3976607b18..2eab0078d7 100644
--- a/src/USER-INTEL/verlet_split_intel.cpp
+++ b/src/USER-INTEL/verlet_split_intel.cpp
@@ -52,6 +52,9 @@ VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) :
   if (universe->procs_per_world[0] % universe->procs_per_world[1])
     error->universe_all(FLERR,"Verlet/split requires Rspace partition "
                         "size be multiple of Kspace partition size");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");
 
   // master = 1 for Rspace procs, 0 for Kspace procs
 
@@ -214,6 +217,9 @@ VerletSplitIntel::~VerletSplitIntel()
 
 void VerletSplitIntel::init()
 {
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");
   if (!force->kspace && comm->me == 0)
     error->warning(FLERR,"No Kspace calculation with verlet/split");
 
diff --git a/src/USER-MISC/pair_list.cpp b/src/USER-MISC/pair_list.cpp
index 880cab6c31..2895b8216b 100644
--- a/src/USER-MISC/pair_list.cpp
+++ b/src/USER-MISC/pair_list.cpp
@@ -48,11 +48,6 @@ static double mypow(double x, int n) {
 }
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/domain_omp.cpp b/src/USER-OMP/domain_omp.cpp
index d5468863cc..584e56b1f0 100644
--- a/src/USER-OMP/domain_omp.cpp
+++ b/src/USER-OMP/domain_omp.cpp
@@ -22,11 +22,6 @@
 using namespace LAMMPS_NS;
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ----------------------------------------------------------------------
    enforce PBC and modify box image flags for each atom
diff --git a/src/USER-OMP/fix_nh_asphere_omp.cpp b/src/USER-OMP/fix_nh_asphere_omp.cpp
index 28a4079ead..4286140a43 100644
--- a/src/USER-OMP/fix_nh_asphere_omp.cpp
+++ b/src/USER-OMP/fix_nh_asphere_omp.cpp
@@ -33,11 +33,6 @@ using namespace FixConst;
 enum{NOBIAS,BIAS};
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/fix_nh_omp.cpp b/src/USER-OMP/fix_nh_omp.cpp
index 4bcde85e03..591503d01e 100644
--- a/src/USER-OMP/fix_nh_omp.cpp
+++ b/src/USER-OMP/fix_nh_omp.cpp
@@ -34,11 +34,6 @@ enum{ISO,ANISO,TRICLINIC};
 #define TILTMAX 1.5
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ----------------------------------------------------------------------
    change box size
diff --git a/src/USER-OMP/fix_nh_sphere_omp.cpp b/src/USER-OMP/fix_nh_sphere_omp.cpp
index a180e7270f..79d98a6cca 100644
--- a/src/USER-OMP/fix_nh_sphere_omp.cpp
+++ b/src/USER-OMP/fix_nh_sphere_omp.cpp
@@ -31,11 +31,6 @@ enum{NOBIAS,BIAS};
 #define INERTIA 0.4          // moment of inertia prefactor for sphere
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/fix_nve_omp.cpp b/src/USER-OMP/fix_nve_omp.cpp
index dd0ae12a5c..9e132aa539 100644
--- a/src/USER-OMP/fix_nve_omp.cpp
+++ b/src/USER-OMP/fix_nve_omp.cpp
@@ -19,11 +19,6 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/fix_nvt_sllod_omp.cpp b/src/USER-OMP/fix_nvt_sllod_omp.cpp
index a3d1c5b04d..0ce946e769 100644
--- a/src/USER-OMP/fix_nvt_sllod_omp.cpp
+++ b/src/USER-OMP/fix_nvt_sllod_omp.cpp
@@ -34,11 +34,6 @@ using namespace FixConst;
 enum{NO_REMAP,X_REMAP,V_REMAP};                   // same as fix_deform.cpp
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/fix_rigid_nh_omp.cpp b/src/USER-OMP/fix_rigid_nh_omp.cpp
index 022a09d6a1..1b16657c9c 100644
--- a/src/USER-OMP/fix_rigid_nh_omp.cpp
+++ b/src/USER-OMP/fix_rigid_nh_omp.cpp
@@ -48,11 +48,6 @@ enum{ISO,ANISO,TRICLINIC};	// same as in FixRigid
 #define EINERTIA 0.4            // moment of inertia prefactor for ellipsoid
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ----------------------------------------------------------------------
    perform preforce velocity Verlet integration
diff --git a/src/USER-OMP/fix_rigid_omp.cpp b/src/USER-OMP/fix_rigid_omp.cpp
index 7ffa741185..f3b16a5ecc 100644
--- a/src/USER-OMP/fix_rigid_omp.cpp
+++ b/src/USER-OMP/fix_rigid_omp.cpp
@@ -42,11 +42,6 @@ enum{SINGLE,MOLECULE,GROUP};	// same as in FixRigid
 #define EINERTIA 0.4            // moment of inertia prefactor for ellipsoid
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/fix_rigid_small_omp.cpp b/src/USER-OMP/fix_rigid_small_omp.cpp
index 292faba919..c6b986da2d 100644
--- a/src/USER-OMP/fix_rigid_small_omp.cpp
+++ b/src/USER-OMP/fix_rigid_small_omp.cpp
@@ -42,11 +42,6 @@ using namespace MathConst;
 enum{FULL_BODY,INITIAL,FINAL,FORCE_TORQUE,VCM_ANGMOM,XCM_MASS,ITENSOR,DOF};
 
 typedef struct { double x,y,z; } dbl3_t;
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
 
 /* ---------------------------------------------------------------------- */
 
diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h
index 9348058d87..c5c6094ee6 100644
--- a/src/USER-OMP/thr_omp.h
+++ b/src/USER-OMP/thr_omp.h
@@ -195,14 +195,4 @@ typedef struct { int a,b,c,d,t;  } int5_t;
 
 }
 
-#ifdef _noalias
-#undef _noalias
-#endif
-
-#if defined(__GNUC__)
-#define _noalias __restrict
-#else
-#define _noalias
-#endif
-
 #endif
diff --git a/src/lmptype.h b/src/lmptype.h
index a03c8601ad..87b7da51ed 100644
--- a/src/lmptype.h
+++ b/src/lmptype.h
@@ -167,6 +167,36 @@ typedef int bigint;
 
 }
 
+// preprocessor macros for compiler specific settings
+// clear previous definitions to avoid redefinition warning
+
+#ifdef _alignvar
+#undef _alignvar
+#endif
+#ifdef _noalias
+#undef _noalias
+#endif
+
+// define stack variable alignment
+
+#if defined(__INTEL_COMPILER)
+#define _alignvar(expr,val) __declspec(align(val)) expr
+#elif defined(__GNUC__)
+#define _alignvar(expr,val) expr __attribute((aligned(val)))
+#else
+#define _alignvar(expr,val) expr
+#endif
+
+// declaration to lift aliasing restrictions
+
+#if defined(__INTEL_COMPILER)
+#define _noalias restrict
+#elif defined(__GNUC__)
+#define _noalias __restrict
+#else
+#define _noalias
+#endif
+
 // settings to enable LAMMPS to build under Windows
 
 #ifdef _WIN32
diff --git a/src/lmpwindows.h b/src/lmpwindows.h
index 50f64eafb2..419aa9aff5 100644
--- a/src/lmpwindows.h
+++ b/src/lmpwindows.h
@@ -15,7 +15,6 @@
 #define ATOBIGINT _atoi64
 
 #define pclose _pclose
-#define __restrict__ __restrict
 
 // the following functions ared defined to get rid of
 // 'ambiguous call to overloaded function' error in VSS for mismathched type arguments