Merge pull request #558 from lammps/intel

memory allocation bugfix for USER-INTEL pppm from M Brown
2017-07-03 12:25:12 -06:00 · 2017-07-03 12:25:12 -06:00 · 5c0c8bb4cd
parent 9eeb97b039 9ca9b5e2ff
commit 5c0c8bb4cd
11 changed files with 232 additions and 116 deletions
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -2,14 +2,18 @@

 _Briefly describe the new feature(s), enhancement(s), or bugfix(es) included in this pull request. If this addresses an open GitHub Issue, mention the issue number, e.g. with `fixes #221` or `closes #135`, so that issue will be automatically closed when the pull request is merged_

-## Implementation Notes
+## Author(s)

-_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
+_Please state name and affiliation of the author or authors that should be credited with the changes in this pull request_

 ## Backward Compatibility

 _Please state whether any changes in the pull request break backward compatibility for inputs, and - if yes - explain what has been changed and why_

+## Implementation Notes
+
+_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
+
 ## Post Submission Checklist

 _Please check the fields below as they are completed_
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
+For simulations on higher node counts, add "processors * * * grid 
+numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
 :ule
@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
 input script to automatically append the "omp" suffix to styles when
 USER-INTEL styles are not available.

+NOTE: For simulations on higher node counts, add "processors * * * 
+grid numa"_processors.html to the beginning of the input script for
+better scalability.
+
 When running on many nodes, performance might be better when using
 fewer OpenMP threads and more MPI tasks. This will depend on the
 simulation and the machine. Using the "verlet/split"_run_style.html
--- a/doc/src/read_data.txt
+++ b/doc/src/read_data.txt
@ -14,7 +14,7 @@ read_data file keyword args ... :pre

 file = name of data file to read in :ulb,l
 zero or more keyword/arg pairs may be appended :l
-keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
+keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
  {add} arg = {append} or {Nstart} or {merge}
    append = add new atoms with IDs appended to current IDs
    Nstart = add new atoms with IDs starting with Nstart
--- a/examples/balance/in.balance.bond.fast
+++ b/examples/balance/in.balance.bond.fast
@ -32,7 +32,7 @@ bond_coeff      1 10.0 1.2
 # need to preserve 1-3, 1-4 pairwise interactions during hard collisions

 special_bonds   lj/coul 0 1 1
-create_bonds    all all 1 1.0 1.5
+create_bonds    many all all 1 1.0 1.5

 neighbor	0.3 bin
 neigh_modify	delay 0 every 1 check yes
--- a/examples/balance/in.balance.bond.slow
+++ b/examples/balance/in.balance.bond.slow
@ -32,7 +32,7 @@ pair_coeff	1 1 10.0 1.0 2.5
 bond_style      harmonic
 bond_coeff      1 10.0 1.2

-create_bonds    all all 1 1.0 1.5
+create_bonds    many all all 1 1.0 1.5

 neighbor	0.3 bin
 neigh_modify	delay 0 every 1 check yes
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -172,6 +172,10 @@ class IntelBuffers {

  inline void thr_pack(const int ifrom, const int ito, const int ago) {
    if (ago == 0) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = lmp->atom->x[i][0];
        _x[i].y = lmp->atom->x[i][1];
@ -179,9 +183,17 @@ class IntelBuffers {
        _x[i].w = lmp->atom->type[i];
      }
      if (lmp->atom->q != NULL)
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
        for (int i = ifrom; i < ito; i++)
          _q[i] = lmp->atom->q[i];
    } else {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = lmp->atom->x[i][0];
        _x[i].y = lmp->atom->x[i][1];
@ -204,7 +216,10 @@ class IntelBuffers {
                           const int offset, const bool dotype = false) {
    double ** x = lmp->atom->x + offset;
    if (dotype == false) {
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = x[i][0];
        _x[i].y = x[i][1];
@ -212,7 +227,10 @@ class IntelBuffers {
      }
    } else {
      int *type = lmp->atom->type + offset;
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = x[i][0];
        _x[i].y = x[i][1];
@ -225,6 +243,10 @@ class IntelBuffers {
  inline void thr_pack_host(const int ifrom, const int ito,
                            const int offset) {
    double ** x = lmp->atom->x + offset;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma ivdep
+    #endif
    for (int i = ifrom; i < ito; i++) {
      _host_x[i].x = x[i][0];
      _host_x[i].y = x[i][1];
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
-#define INTEL_P3M_MAXORDER 7
+#define INTEL_P3M_MAXORDER 8
 #define INTEL_P3M_ALIGNED_MAXORDER 8
 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
 #define INTEL_P3M_TABLE 1
--- a/src/USER-INTEL/pppm_disp_intel.cpp
+++ b/src/USER-INTEL/pppm_disp_intel.cpp
@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * q[i];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            my_density[mzyx] += x0*rho[0][l];
          }
@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * B[type];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            my_density[mzyx] += x0*rho[0][l];
          }
@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            FFT_SCALAR w = x0*rho[0][l];
            density_brick_a0[mz][my][mx] += w*B[7*type];
@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            FFT_SCALAR w0 = x0*rho[0][l];
            for(int k = 0; k < nsplit; k++)
@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
@ -1490,12 +1495,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;

-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx += ekx_arr[l];
-          eky += eky_arr[l];
-          ekz += ekz_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+        ekx += ekx_arr[l];
+	eky += eky_arr[l];
+	ekz += ekz_arr[l];
+      }

      // convert E-field to force

@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)

      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
@ -1837,12 +1843,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;

-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx += ekx_arr[l];
-          eky += eky_arr[l];
-          ekz += ekz_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+        ekx += ekx_arr[l];
+	eky += eky_arr[l];
+	ekz += ekz_arr[l];
+      }

      // convert E-field to force

@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)

      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
            eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)


      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
@ -2221,29 +2228,29 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
      ekx5 = eky5 = ekz5 = ZEROF;
      ekx6 = eky6 = ekz6 = ZEROF;

-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx0 += ekx0_arr[l];
-          eky0 += eky0_arr[l];
-          ekz0 += ekz0_arr[l];
-          ekx1 += ekx1_arr[l];
-          eky1 += eky1_arr[l];
-          ekz1 += ekz1_arr[l];
-          ekx2 += ekx2_arr[l];
-          eky2 += eky2_arr[l];
-          ekz2 += ekz2_arr[l];
-          ekx3 += ekx3_arr[l];
-          eky3 += eky3_arr[l];
-          ekz3 += ekz3_arr[l];
-          ekx4 += ekx4_arr[l];
-          eky4 += eky4_arr[l];
-          ekz4 += ekz4_arr[l];
-          ekx5 += ekx5_arr[l];
-          eky5 += eky5_arr[l];
-          ekz5 += ekz5_arr[l];
-          ekx6 += ekx6_arr[l];
-          eky6 += eky6_arr[l];
-          ekz6 += ekz6_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+	ekx0 += ekx0_arr[l];
+	eky0 += eky0_arr[l];
+	ekz0 += ekz0_arr[l];
+	ekx1 += ekx1_arr[l];
+	eky1 += eky1_arr[l];
+	ekz1 += ekz1_arr[l];
+	ekx2 += ekx2_arr[l];
+	eky2 += eky2_arr[l];
+	ekz2 += ekz2_arr[l];
+	ekx3 += ekx3_arr[l];
+	eky3 += eky3_arr[l];
+	ekz3 += ekz3_arr[l];
+	ekx4 += ekx4_arr[l];
+	eky4 += eky4_arr[l];
+	ekz4 += ekz4_arr[l];
+	ekx5 += ekx5_arr[l];
+	eky5 += eky5_arr[l];
+	ekz5 += ekz5_arr[l];
+	ekx6 += ekx6_arr[l];
+	eky6 += eky6_arr[l];
+	ekz6 += ekz6_arr[l];
+      }

      // convert D-field to force

@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
      particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            FFT_SCALAR x0 = drho[0][l] * ekx_p;
            FFT_SCALAR y0 = rho[0][l] * eky_p;
@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx0[i] += ekx0[l];
        particle_eky0[i] += eky0[l];
        particle_ekz0[i] += ekz0[l];
@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)

      for (int k = 0; k < nsplit; k++) {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int n = 0; n < order_6; n++) {
          int mz = n+nzsum;
          FFT_SCALAR z0 = rho2[n];
          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #endif
          for (int m = 0; m < order_6; m++) {
            int my = m+nysum;
            FFT_SCALAR y0 = z0*rho1[m];
            #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
            #pragma simd
            #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
              int mx = l+nxsum;
              FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
@ -2716,13 +2725,13 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
        ekx[k] = eky[k] = ekz[k] = ZEROF;
      }

-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          for (int k = 0; k < nsplit; k++) {
-            ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-            eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-            ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-          }
-        }
+      for (int l = 0; l < order; l++) {
+	for (int k = 0; k < nsplit; k++) {
+	  ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	  eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	  ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	}
+      }

      // convert E-field to force

@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
      for (int k = 0; k < nsplit; k++) {
        particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int n = 0; n < order_6; n++) {
          int mz = n + nzsum;
          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #endif
          for (int m = 0; m < order_6; m++) {
            int my = m + nysum;
@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
            FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
            FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
            #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
            #pragma simd
            #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
              int mx = l + nxsum;
              ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
                u_brick_none[k][mz][my][mx];
@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        for (int k = 0; k < nsplit; k++) {
          ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
          eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@ -149,13 +149,13 @@ void PPPMIntel::init()
    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
-                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-                            "pppmintel:vdxy_brick");
+    create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
+		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+		    "pppmintel:vdxy_brick");
    memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
-                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-                            "pppmintel:vdz0_brick");
+    create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
+		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+		    "pppmintel:vdz0_brick");
    memory->destroy(work3);
    memory->create(work3, 2*nfft_both, "pppmintel:work3");

@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * q[i];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int mzy = m*nix + mz;
@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m+nysum;
@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
      ekx = eky = ekz = ZEROF;

      if (use_packing) {
-        for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
+        for (int l = 0; l < 2*order; l += 2) {
          ekx += ekxy_arr[l];
          eky += ekxy_arr[l+1];
          ekz += ekz0_arr[l];
        }
      } else {
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int l = 0; l < order; l++) {
          ekx += ekx_arr[l];
          eky += eky_arr[l];
          ekz += ekz_arr[l];
@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
  fix->stop_watch(TIME_PACK);
 }

+/* ----------------------------------------------------------------------
+   Re-allocate density_brick and u_brick/vd*_brick padded for vector writes
+------------------------------------------------------------------------- */
+
+void PPPMIntel::allocate()
+{
+  PPPM::allocate();
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                  nxlo_out,nxhi_out,"pppm:density_brick");
+
+  if (differentiation_flag != 1) {
+    memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
+    create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                    nxlo_out,nxhi_out,"pppm:vdx_brick");
+    create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                    nxlo_out,nxhi_out,"pppm:vdy_brick");
+    create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                    nxlo_out,nxhi_out,"pppm:vdz_brick");
+  } else {
+    memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
+    create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                    nxlo_out,nxhi_out,"pppm:u_brick");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Create 3D-offset allocation with extra padding for vector writes.
+   Index ranges are inclusive; the pad is 2*INTEL_P3M_ALIGNED_MAXORDER
+   FFT_SCALAR elements appended after the last grid value.
+------------------------------------------------------------------------- */
+
+FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
+                                          int n1hi, int n2lo, int n2hi,
+                                          int n3lo, int n3hi, const char *name)
+{
+  int n1 = n1hi - n1lo + 1;
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+
+  // pad in elements, not bytes: scale the pad by sizeof(FFT_SCALAR) too
+  bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) *
+    (((bigint) n1)*n2*n3 + INTEL_P3M_ALIGNED_MAXORDER*2);
+  FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
+  FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
+  array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);
+
+  // point every (i,j) row at its n3-long slice of the contiguous data
+  bigint n = 0;
+  for (int i = 0; i < n1; i++) {
+    bigint m = ((bigint) i) * n2;
+    array[i] = &plane[m];
+    for (int j = 0; j < n2; j++, n += n3) plane[m+j] = &data[n];
+  }
+
+  // shift the pointers so the array is indexed from [n1lo][n2lo][n3lo]
+  const bigint nrows = ((bigint) n1) * n2;
+  for (bigint i = 0; i < nrows; i++) array[0][i] -= n3lo;
+  for (int i = 0; i < n1; i++) array[i] -= n2lo;
+  array -= n1lo;
+  return array;
+}
+
 /* ----------------------------------------------------------------------
   Returns 0 if Intel optimizations for PPPM ignored due to offload
 ------------------------------------------------------------------------- */
--- a/src/USER-INTEL/pppm_intel.h
+++ b/src/USER-INTEL/pppm_intel.h
@ -74,9 +74,10 @@ class PPPMIntel : public PPPM {
  int _use_base;
  #endif

-    template<class flt_t, class acc_t>
-    void test_function(IntelBuffers<flt_t,acc_t> *buffers);
+  virtual void allocate();

+  template<class flt_t, class acc_t>
+  void test_function(IntelBuffers<flt_t,acc_t> *buffers);

  void precompute_rho();
  template<class flt_t, class acc_t>
@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
      fieldforce_ad<flt_t,acc_t,0>(buffers);
    }
  }
+  FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
+				int, int, int, const char *name);
 };

 }