From e634c5a2dec82d15b420dea0d8b272b2742956f3 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Mon, 3 Jul 2017 08:53:53 -0600 Subject: [PATCH] memory allocation bugfix for USER-INTEL pppm from M Brown --- doc/src/accelerate_intel.txt | 6 + doc/src/read_data.txt | 2 +- src/USER-INTEL/fix_intel.cpp | 12 +- src/USER-INTEL/intel_buffers.h | 26 ++++- src/USER-INTEL/intel_preprocess.h | 17 ++- src/USER-INTEL/pppm_disp_intel.cpp | 182 +++++++++++++++-------------- src/USER-INTEL/pppm_intel.cpp | 99 +++++++++++++--- src/USER-INTEL/pppm_intel.h | 7 +- 8 files changed, 239 insertions(+), 112 deletions(-) diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index f5bd66aeba..155e29e367 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l For some of the simple 2-body potentials without long-range electrostatics, performance and scalability can be better with the "newton off" setting added to the input script :l +For simulations on higher node counts, add "processors * * * grid +numa" to the beginning of the input script for better scalability :l If using {kspace_style pppm} in the input script, add "kspace_modify diff ad" for better performance :l :ule @@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the input script to automatically append the "omp" suffix to styles when USER-INTEL styles are not available. +NOTE: For simulations on higher node counts, add "processors * * * +grid numa"_processors.html to the beginning of the input script for +better scalability. + When running on many nodes, performance might be better when using fewer OpenMP threads and more MPI tasks. This will depend on the simulation and the machine. Using the "verlet/split"_run_style.html diff --git a/doc/src/read_data.txt b/doc/src/read_data.txt index bd602eee5a..6785eb1066 100644 --- a/doc/src/read_data.txt +++ b/doc/src/read_data.txt @@ -14,7 +14,7 @@ read_data file keyword args ... 
:pre file = name of data file to read in :ulb,l zero or more keyword/arg pairs may be appended :l -keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l +keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l {add} arg = {append} or {Nstart} or {merge} append = add new atoms with IDs appended to current IDs Nstart = add new atoms with IDs starting with Nstart diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index b06f76c90d..637fc0d06e 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, if (eatom) { double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector + #pragma vector aligned + #pragma ivdep #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; @@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector + #pragma vector aligned + #pragma ivdep #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; @@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, if (eatom) { double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector + #pragma vector aligned + #pragma ivdep #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; @@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector + #pragma vector aligned + #pragma ivdep #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 135309fe44..7a7640a203 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -172,6 +172,10 @@ class IntelBuffers { inline void thr_pack(const int ifrom, const int ito, const int ago) { if (ago == 0) { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) { _x[i].x = lmp->atom->x[i][0]; _x[i].y = lmp->atom->x[i][1]; @@ -179,9 +183,17 @@ class IntelBuffers { _x[i].w = lmp->atom->type[i]; } if (lmp->atom->q != NULL) + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) _q[i] = lmp->atom->q[i]; } else { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) { _x[i].x = lmp->atom->x[i][0]; _x[i].y = lmp->atom->x[i][1]; @@ -204,7 +216,10 @@ class IntelBuffers { const int offset, const bool dotype = false) { double ** x = lmp->atom->x + offset; if (dotype == false) { - #pragma vector nontemporal + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) { _x[i].x = x[i][0]; _x[i].y = x[i][1]; @@ -212,7 +227,10 @@ class IntelBuffers { } } else { int *type = lmp->atom->type + offset; - #pragma vector nontemporal + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) { 
_x[i].x = x[i][0]; _x[i].y = x[i][1]; @@ -225,6 +243,10 @@ class IntelBuffers { inline void thr_pack_host(const int ifrom, const int ito, const int offset) { double ** x = lmp->atom->x + offset; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int i = ifrom; i < ito; i++) { _host_x[i].x = x[i][0]; _host_x[i].y = x[i][1]; diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index 1f6225dc44..e256dd1abc 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_MAX_STENCIL 256 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL) #define INTEL_MAX_STENCIL_CHECK 4096 -#define INTEL_P3M_MAXORDER 7 +#define INTEL_P3M_MAXORDER 8 #define INTEL_P3M_ALIGNED_MAXORDER 8 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY) #define INTEL_P3M_TABLE 1 @@ -248,6 +248,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #else +#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ + } + #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ { \ tid = 0; \ @@ -293,6 +299,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = vecsize; \ + } + #endif #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index ec5f5150c2..1269579ff4 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers *buffers) FFT_SCALAR z0 = fdelvolinv * q[i]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n*nix*niy + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mzyx = l + mzy; my_density[mzyx] += x0*rho[0][l]; } @@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers *buffers) FFT_SCALAR z0 = fdelvolinv * B[type]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n*nix*niy + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mzyx = l + mzy; my_density[mzyx] += x0*rho[0][l]; } @@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers *buffers) FFT_SCALAR z0 = fdelvolinv; #if 
defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m + nysum; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l + nxsum; FFT_SCALAR w = x0*rho[0][l]; density_brick_a0[mz][my][mx] += w*B[7*type]; @@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers *buffers) FFT_SCALAR z0 = fdelvolinv; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n*nix*niy + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mzyx = l + mzy; FFT_SCALAR w0 = x0*rho[0][l]; for(int k = 0; k < nsplit; k++) @@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers *buffers) _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l+nxsum; FFT_SCALAR x0 = y0*rho0[l]; ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; @@ -1490,12 +1495,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers *buffers) FFT_SCALAR ekx, eky, ekz; ekx = eky = ekz = ZEROF; - - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx += ekx_arr[l]; - eky += eky_arr[l]; - ekz += ekz_arr[l]; - } + for (int l = 0; l < order; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } // convert E-field to force @@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers *buffers) particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int my = m + nysum; @@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) + 
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l + nxsum; ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; @@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) - #pragma simd + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int l = 0; l < order; l++){ particle_ekx[i] += ekx[l]; particle_eky[i] += eky[l]; particle_ekz[i] += ekz[l]; @@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers *buffers) _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l+nxsum; FFT_SCALAR x0 = y0*rho0[l]; ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx]; @@ -1837,12 +1843,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers *buffers) FFT_SCALAR ekx, eky, ekz; ekx = eky = ekz = ZEROF; - - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx += ekx_arr[l]; - eky += eky_arr[l]; - ekz += ekz_arr[l]; - } + for (int l = 0; l < order; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } // convert E-field to force @@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers *buffers) particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m + nysum; @@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l + nxsum; ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx]; eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx]; @@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) - #pragma simd + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int l = 0; l < order; l++){ particle_ekx[i] += ekx[l]; particle_eky[i] += eky[l]; particle_ekz[i] += ekz[l]; @@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers *buffers) #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) 
#endif for (int n = 0; n < order_6; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l+nxsum; FFT_SCALAR x0 = y0*rho0[l]; ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx]; @@ -2221,29 +2228,29 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers *buffers) ekx5 = eky5 = ekz5 = ZEROF; ekx6 = eky6 = ekz6 = ZEROF; - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx0 += ekx0_arr[l]; - eky0 += eky0_arr[l]; - ekz0 += ekz0_arr[l]; - ekx1 += ekx1_arr[l]; - eky1 += eky1_arr[l]; - ekz1 += ekz1_arr[l]; - ekx2 += ekx2_arr[l]; - eky2 += eky2_arr[l]; - ekz2 += ekz2_arr[l]; - ekx3 += ekx3_arr[l]; - eky3 += eky3_arr[l]; - ekz3 += ekz3_arr[l]; - ekx4 += ekx4_arr[l]; - eky4 += eky4_arr[l]; - ekz4 += ekz4_arr[l]; - ekx5 += ekx5_arr[l]; - eky5 += eky5_arr[l]; - ekz5 += ekz5_arr[l]; - ekx6 += ekx6_arr[l]; - eky6 += eky6_arr[l]; - ekz6 += ekz6_arr[l]; - } + for (int l = 0; l < order; l++) { + ekx0 += ekx0_arr[l]; + eky0 += eky0_arr[l]; + ekz0 += ekz0_arr[l]; + ekx1 += ekx1_arr[l]; + eky1 += eky1_arr[l]; + ekz1 += ekz1_arr[l]; + ekx2 += ekx2_arr[l]; + eky2 += eky2_arr[l]; + ekz2 += ekz2_arr[l]; + ekx3 += ekx3_arr[l]; + eky3 += eky3_arr[l]; + ekz3 += ekz3_arr[l]; + ekx4 += ekx4_arr[l]; + eky4 += eky4_arr[l]; + ekz4 += ekz4_arr[l]; + ekx5 += ekx5_arr[l]; + eky5 += eky5_arr[l]; + ekz5 += ekz5_arr[l]; + ekx6 += ekx6_arr[l]; + eky6 += eky6_arr[l]; + ekz6 += ekz6_arr[l]; + } // convert D-field to force @@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers *buffers) particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m + nysum; @@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l + nxsum; FFT_SCALAR x0 = drho[0][l] * ekx_p; FFT_SCALAR y0 = rho[0][l] * eky_p; @@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) - #pragma simd + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int l = 0; l < order; l++){ particle_ekx0[i] += ekx0[l]; particle_eky0[i] += eky0[l]; particle_ekz0[i] += ekz0[l]; @@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers *buffers) for (int k = 0; k < nsplit; k++) { #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if 
defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l+nxsum; FFT_SCALAR x0 = y0*rho0[l]; ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= @@ -2716,13 +2725,13 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers *buffers) ekx[k] = eky[k] = ekz[k] = ZEROF; } - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - for (int k = 0; k < nsplit; k++) { - ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - } - } + for (int l = 0; l < order; l++) { + for (int k = 0; k < nsplit; k++) { + ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + } + } // convert E-field to force @@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers *buffers) for (int k = 0; k < nsplit; k++) { particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order_6; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order_6; m++) { int my = m + nysum; @@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers *buffers) FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #pragma simd #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { int mx = l + nxsum; ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * u_brick_none[k][mz][my][mx]; @@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) - #pragma simd + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int l = 0; l < order; l++){ for (int k = 0; k < nsplit; k++) { ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l]; eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l]; diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index 8416b6f3a3..f1cfe591f2 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -149,13 +149,13 @@ void PPPMIntel::init() memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); - memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, - nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, - "pppmintel:vdxy_brick"); + create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdxy_brick"); memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); - memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, - nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, - "pppmintel:vdz0_brick"); + 
create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdz0_brick"); memory->destroy(work3); memory->create(work3, 2*nfft_both, "pppmintel:work3"); @@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) FFT_SCALAR z0 = fdelvolinv * q[i]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n*nix*niy + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int mzy = m*nix + mz; @@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int my = m+nysum; @@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) ekx = eky = ekz = ZEROF; if (use_packing) { - for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) { + for (int l = 0; l < 2*order; l += 2) { ekx += ekxy_arr[l]; eky += ekxy_arr[l+1]; ekz += ekz0_arr[l]; } } else { - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int l = 0; l < order; l++) { ekx += ekx_arr[l]; eky += eky_arr[l]; ekz += ekz_arr[l]; @@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int n = 0; n < order; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif for (int m = 0; m < order; m++) { int my = m + nysum; @@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) } #if defined(LMP_SIMD_COMPILER) - #pragma simd + #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7) #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int l = 0; l < order; l++){ particle_ekx[i] += ekx[l]; particle_eky[i] += eky[l]; particle_ekz[i] += ekz[l]; @@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers() fix->stop_watch(TIME_PACK); } +/* ---------------------------------------------------------------------- + Allocate density_brick with extra padding for vector writes +------------------------------------------------------------------------- */ + +void PPPMIntel::allocate() +{ + PPPM::allocate(); + memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); + create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick"); + + if (differentiation_flag == 1) { + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); + create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + } else { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + 
create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick"); + create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdy_brick"); + create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdz_brick"); + } +} + +/* ---------------------------------------------------------------------- + Create 3D-offset allocation with extra padding for vector writes +------------------------------------------------------------------------- */ + +FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo, + int n1hi, int n2lo, int n2hi, + int n3lo, int n3hi, + const char *name) +{ + int n1 = n1hi - n1lo + 1; + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + + bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * n1*n2*n3 + + INTEL_P3M_ALIGNED_MAXORDER*2; + FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name); + nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2; + FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name); + nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1; + array = (FFT_SCALAR ***) memory->smalloc(nbytes,name); + + bigint m; + bigint n = 0; + for (int i = 0; i < n1; i++) { + m = ((bigint) i) * n2; + array[i] = &plane[m]; + for (int j = 0; j < n2; j++) { + plane[m+j] = &data[n]; + n += n3; + } + } + + m = ((bigint) n1) * n2; + for (bigint i = 0; i < m; i++) array[0][i] -= n3lo; + for (int i = 0; i < n1; i++) array[i] -= n2lo; + array -= n1lo; + return array; +} + /* ---------------------------------------------------------------------- Returns 0 if Intel optimizations for PPPM ignored due to offload ------------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h index e152486b29..5bffabe0e5 100644 --- a/src/USER-INTEL/pppm_intel.h +++ b/src/USER-INTEL/pppm_intel.h @@ -74,9 +74,10 @@ class PPPMIntel : public PPPM { int _use_base; #endif - template - void test_function(IntelBuffers *buffers); + virtual void allocate(); + template + void test_function(IntelBuffers *buffers); void precompute_rho(); template @@ -120,6 +121,8 @@ class PPPMIntel : public PPPM { fieldforce_ad(buffers); } } + FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int, + int, int, int, const char *name); }; }
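The heart of the bugfix is the padded brick allocation above: PPPMIntel::allocate() re-creates density_brick (and u_brick or the vdx/vdy/vdz bricks) through a local create3d_offset() that adds extra storage at the end of the flat data block, so that aligned, full-vector-width stores issued near the last grid point stay inside the allocation. The stand-alone sketch below mirrors the pointer arithmetic of create3d_offset() outside of LAMMPS; plain new[]/delete[] replace Memory::smalloc()/sfree(), FFT_SCALAR is taken as double, and PAD_ELEMENTS is a hypothetical constant expressed in elements rather than the byte arithmetic used in the patch.

// Stand-alone sketch of a padded 3D-offset allocation (not the LAMMPS code).
#include <cstdio>

static const int PAD_ELEMENTS = 16;  // slack for full-width vector stores at the end

// Build array[i][j][k] for i in [n1lo,n1hi], j in [n2lo,n2hi], k in [n3lo,n3hi]
// on top of one contiguous, zero-initialized data block with tail padding.
double ***create3d_offset_padded(int n1lo, int n1hi, int n2lo, int n2hi,
                                 int n3lo, int n3hi, double *&data)
{
  const int n1 = n1hi - n1lo + 1;
  const int n2 = n2hi - n2lo + 1;
  const int n3 = n3hi - n3lo + 1;

  data = new double[(long long)n1 * n2 * n3 + PAD_ELEMENTS]();
  double **plane = new double *[(long long)n1 * n2];
  double ***array = new double **[n1];

  long long n = 0;
  for (int i = 0; i < n1; i++) {
    long long m = (long long)i * n2;
    array[i] = &plane[m];
    for (int j = 0; j < n2; j++) {
      plane[m + j] = &data[n];      // row j of plane i starts here
      n += n3;
    }
  }

  // Shift the pointer tables so callers can index with the "lo" offsets.
  const long long nrows = (long long)n1 * n2;
  for (long long i = 0; i < nrows; i++) array[0][i] -= n3lo;
  for (int i = 0; i < n1; i++) array[i] -= n2lo;
  array -= n1lo;
  return array;
}

int main()
{
  const int lo = -2, hi = 5;   // brick extent including ghost cells
  double *data;
  double ***brick = create3d_offset_padded(lo, hi, lo, hi, lo, hi, data);

  // With a low interpolation order only the first couple of stencil points
  // are meaningful, but an aligned full-width store may touch 8 elements
  // starting near the last grid point; the extra elements fall into the
  // tail padding instead of running past the allocation.
  for (int l = 0; l < 8; l++)
    brick[hi][hi][hi - 1 + l] += (l < 2) ? 1.0 : 0.0;
  printf("brick[%d][%d][%d] = %g\n", hi, hi, hi, brick[hi][hi][hi]);

  // Undo the "lo" shifts before freeing the three blocks.
  double ***array = brick + lo;
  double **plane = array[0] + lo;
  delete[] data;
  delete[] plane;
  delete[] array;
  return 0;
}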
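The pragma changes follow a single pattern: Intel-specific hints are wrapped in the LMP_SIMD_COMPILER guard so other compilers ignore them, the short stencil loops get a loop_count min/max/avg hint in place of the old fixed loop_count=7, and the innermost loop is vectorized with #pragma simd. A minimal sketch of that pattern, assuming a hypothetical spread_row() kernel in place of the actual make_rho()/fieldforce() bodies:

// Guarded compiler-hint pattern used throughout the patch.  spread_row() is
// a hypothetical stand-in for the innermost charge-spreading loop; the
// pragmas are only emitted when LMP_SIMD_COMPILER is defined (an Intel
// compiler build), so the function still compiles cleanly elsewhere.

#define INTEL_P3M_ALIGNED_MAXORDER 8

void spread_row(double *row, const double *rho0, double x0, int order)
{
#if defined(LMP_SIMD_COMPILER)
  // Tell the optimizer the expected trip count of this short loop
  // (the stencil order is small, typically 2-7) and request vectorization.
  #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
  #pragma simd
#endif
  for (int l = 0; l < order; l++)
    row[l] += x0 * rho0[l];
}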