memory allocation bugfix for USER-INTEL pppm from M Brown

2017-07-03 08:53:53 -06:00 · 2017-07-03 08:53:53 -06:00 · e634c5a2de
parent 883b7aaa0e
commit e634c5a2de
8 changed files with 239 additions and 112 deletions
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
+For simulations on higher node counts, add "processors * * * grid 
+numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
 :ule
@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
 input script to automatically append the "omp" suffix to styles when
 USER-INTEL styles are not available.

+NOTE: For simulations on higher node counts, add "processors * * *
+grid numa"_processors.html to the beginning of the input script for
+better scalability.
+
 When running on many nodes, performance might be better when using
 fewer OpenMP threads and more MPI tasks. This will depend on the
 simulation and the machine. Using the "verlet/split"_run_style.html
--- a/doc/src/read_data.txt
+++ b/doc/src/read_data.txt
@ -14,7 +14,7 @@ read_data file keyword args ... :pre

 file = name of data file to read in :ulb,l
 zero or more keyword/arg pairs may be appended :l
-keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
+keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
  {add} arg = {append} or {Nstart} or {merge}
    append = add new atoms with IDs appended to current IDs
    Nstart = add new atoms with IDs starting with Nstart
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[ii].x;
@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
      if (eatom) {
        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
        }
      } else {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
        #endif
        for (int i = ifrom; i < ito; i++) {
          f[i].x += f_in[i].x;
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -172,6 +172,10 @@ class IntelBuffers {

  inline void thr_pack(const int ifrom, const int ito, const int ago) {
    if (ago == 0) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = lmp->atom->x[i][0];
        _x[i].y = lmp->atom->x[i][1];
@ -179,9 +183,17 @@ class IntelBuffers {
        _x[i].w = lmp->atom->type[i];
      }
      if (lmp->atom->q != NULL)
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
        for (int i = ifrom; i < ito; i++)
          _q[i] = lmp->atom->q[i];
    } else {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = lmp->atom->x[i][0];
        _x[i].y = lmp->atom->x[i][1];
@ -204,7 +216,10 @@ class IntelBuffers {
                           const int offset, const bool dotype = false) {
    double ** x = lmp->atom->x + offset;
    if (dotype == false) {
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = x[i][0];
        _x[i].y = x[i][1];
@ -212,7 +227,10 @@ class IntelBuffers {
      }
    } else {
      int *type = lmp->atom->type + offset;
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
      for (int i = ifrom; i < ito; i++) {
        _x[i].x = x[i][0];
        _x[i].y = x[i][1];
@ -225,6 +243,10 @@ class IntelBuffers {
  inline void thr_pack_host(const int ifrom, const int ito,
                            const int offset) {
    double ** x = lmp->atom->x + offset;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma ivdep
+    #endif
    for (int i = ifrom; i < ito; i++) {
      _host_x[i].x = x[i][0];
      _host_x[i].y = x[i][1];
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
-#define INTEL_P3M_MAXORDER 7
+#define INTEL_P3M_MAXORDER 8
 #define INTEL_P3M_ALIGNED_MAXORDER 8
 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
 #define INTEL_P3M_TABLE 1
@ -248,6 +248,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,

 #else

+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
+  { /* no work split: tid, nthreads unused; range is [0,inum) */ \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+  }
+
 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
  {                                                             \
    tid = 0;                                                    \
@ -293,6 +299,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
    ito = inum;                                                 \
  }

+#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum,      \
+                                nthreads, vecsize)              \
+  { /* tid 0 gets all of [0,inum); ip is set to vecsize */      \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+    ip = vecsize;                                               \
+  }
+
 #endif

 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
--- a/src/USER-INTEL/pppm_disp_intel.cpp
+++ b/src/USER-INTEL/pppm_disp_intel.cpp
@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * q[i];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            my_density[mzyx] += x0*rho[0][l];
          }
@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * B[type];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            my_density[mzyx] += x0*rho[0][l];
          }
@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            FFT_SCALAR w = x0*rho[0][l];
            density_brick_a0[mz][my][mx] += w*B[7*type];
@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int mzy = m*nix + mz;
          FFT_SCALAR x0 = y0*rho[1][m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mzyx = l + mzy;
            FFT_SCALAR w0 = x0*rho[0][l];
            for(int k = 0; k < nsplit; k++)
@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
@ -1490,8 +1495,7 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;

-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+      for (int l = 0; l < order; l++) {
        ekx += ekx_arr[l];
 	eky += eky_arr[l];
 	ekz += ekz_arr[l];
@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)

      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
@ -1837,8 +1843,7 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR ekx, eky, ekz;
      ekx = eky = ekz = ZEROF;

-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+      for (int l = 0; l < order; l++) {
        ekx += ekx_arr[l];
 	eky += eky_arr[l];
 	ekz += ekz_arr[l];
@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)

      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
            eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)


      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m+nysum;
          FFT_SCALAR y0 = z0*rho1[m];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l+nxsum;
            FFT_SCALAR x0 = y0*rho0[l];
              ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
@ -2221,7 +2228,7 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
      ekx5 = eky5 = ekz5 = ZEROF;
      ekx6 = eky6 = ekz6 = ZEROF;

-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+      for (int l = 0; l < order; l++) {
 	ekx0 += ekx0_arr[l];
 	eky0 += eky0_arr[l];
 	ekz0 += ekz0_arr[l];
@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
      particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order_6; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order_6; m++) {
          int my = m + nysum;
@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #pragma simd
          #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
            int mx = l + nxsum;
            FFT_SCALAR x0 = drho[0][l] * ekx_p;
            FFT_SCALAR y0 = rho[0][l] * eky_p;
@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx0[i] += ekx0[l];
        particle_eky0[i] += eky0[l];
        particle_ekz0[i] += ekz0[l];
@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)

      for (int k = 0; k < nsplit; k++) {
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int n = 0; n < order_6; n++) {
          int mz = n+nzsum;
          FFT_SCALAR z0 = rho2[n];
          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #endif
          for (int m = 0; m < order_6; m++) {
            int my = m+nysum;
            FFT_SCALAR y0 = z0*rho1[m];
            #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
            #pragma simd
            #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
              int mx = l+nxsum;
              FFT_SCALAR x0 = y0*rho0[l];
              ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
@ -2716,7 +2725,7 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
        ekx[k] = eky[k] = ekz[k] = ZEROF;
      }

-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+      for (int l = 0; l < order; l++) {
 	for (int k = 0; k < nsplit; k++) {
 	  ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
 	  eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
      for (int k = 0; k < nsplit; k++) {
        particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int n = 0; n < order_6; n++) {
          int mz = n + nzsum;
          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
          #endif
          for (int m = 0; m < order_6; m++) {
            int my = m + nysum;
@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
            FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
            FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
            #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
            #pragma simd
            #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
              int mx = l + nxsum;
              ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
                u_brick_none[k][mz][my][mx];
@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        for (int k = 0; k < nsplit; k++) {
          ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
          eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@ -149,11 +149,11 @@ void PPPMIntel::init()
    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
    memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
+    create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
 		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
 		    "pppmintel:vdxy_brick");
    memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
+    create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
 		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
 		    "pppmintel:vdz0_brick");
    memory->destroy(work3);
@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
      FFT_SCALAR z0 = fdelvolinv * q[i];

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n*nix*niy + nzsum;
        FFT_SCALAR y0 = z0*rho[2][n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int mzy = m*nix + mz;
@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
      _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n+nzsum;
        FFT_SCALAR z0 = rho2[n];
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m+nysum;
@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
      ekx = eky = ekz = ZEROF;

      if (use_packing) {
-        for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
+        for (int l = 0; l < 2*order; l += 2) {
          ekx += ekxy_arr[l];
          eky += ekxy_arr[l+1];
          ekz += ekz0_arr[l];
        }
      } else {
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int l = 0; l < order; l++) {
          ekx += ekx_arr[l];
          eky += eky_arr[l];
          ekz += ekz_arr[l];
@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;

      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
      for (int n = 0; n < order; n++) {
        int mz = n + nzsum;
        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
        #endif
        for (int m = 0; m < order; m++) {
          int my = m + nysum;
@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
      }

      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
        particle_ekx[i] += ekx[l];
        particle_eky[i] += eky[l];
        particle_ekz[i] += ekz[l];
@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
  fix->stop_watch(TIME_PACK);
 }

+/* ----------------------------------------------------------------------
+   Run PPPM::allocate(), then swap density_brick and the u_brick or
+   vd{x,y,z}_brick arrays for padded create3d_offset() allocations
+------------------------------------------------------------------------- */
+
+void PPPMIntel::allocate()
+{
+  PPPM::allocate();
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                  nxlo_out,nxhi_out,"pppm:density_brick");
+  if (differentiation_flag == 1) {        // re-create u_brick only
+    memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
+    create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                    nxlo_out,nxhi_out,"pppm:u_brick");
+    return;
+  }
+  memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
+  create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                  nxlo_out,nxhi_out,"pppm:vdx_brick");
+  create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                  nxlo_out,nxhi_out,"pppm:vdy_brick");
+  create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                  nxlo_out,nxhi_out,"pppm:vdz_brick");
+}
+
+/* ----------------------------------------------------------------------
+   Create 3D-offset allocation; the data block is padded with
+   2*INTEL_P3M_ALIGNED_MAXORDER extra elements for vector writes
+------------------------------------------------------------------------- */
+
+FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
+                                          int n1hi, int n2lo, int n2hi,
+                                          int n3lo, int n3hi, const char *name)
+{
+  int n1 = n1hi - n1lo + 1;
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+
+  // padding is a count of elements, so scale it by the element size too
+  bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) *
+    (((bigint) n1)*n2*n3 + INTEL_P3M_ALIGNED_MAXORDER*2);
+  FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
+  FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
+  array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);
+
+  bigint m, n = 0;
+  for (int i = 0; i < n1; i++) {
+    m = ((bigint) i) * n2;
+    array[i] = &plane[m];
+    for (int j = 0; j < n2; j++) {
+      plane[m+j] = &data[n];
+      n += n3;
+    }
+  }
+  // shift the pointer tables so valid indices start at n1lo, n2lo, n3lo
+  m = ((bigint) n1) * n2;
+  for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
+  for (int i = 0; i < n1; i++) array[i] -= n2lo;
+  array -= n1lo;
+  return array;
+}
+
 /* ----------------------------------------------------------------------
   Returns 0 if Intel optimizations for PPPM ignored due to offload
 ------------------------------------------------------------------------- */
--- a/src/USER-INTEL/pppm_intel.h
+++ b/src/USER-INTEL/pppm_intel.h
@ -74,10 +74,11 @@ class PPPMIntel : public PPPM {
  int _use_base;
  #endif

+  virtual void allocate();
+
  template<class flt_t, class acc_t>
  void test_function(IntelBuffers<flt_t,acc_t> *buffers);

-
  void precompute_rho();
  template<class flt_t, class acc_t>
  void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
      fieldforce_ad<flt_t,acc_t,0>(buffers);
    }
  }
+  FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
+				int, int, int, const char *name);
 };

 }