memory allocation bugfix for USER-INTEL pppm from M Brown

This commit is contained in:
Steve Plimpton 2017-07-03 08:53:53 -06:00
parent 883b7aaa0e
commit e634c5a2de
8 changed files with 239 additions and 112 deletions

View File

@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
For some of the simple 2-body potentials without long-range
electrostatics, performance and scalability can be better with
the "newton off" setting added to the input script :l
For simulations on higher node counts, add "processors * * * grid
numa" to the beginning of the input script for better scalability :l
If using {kspace_style pppm} in the input script, add
"kspace_modify diff ad" for better performance :l
:ule
@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
input script to automatically append the "omp" suffix to styles when
USER-INTEL styles are not available.
NOTE: For simulations on higher node counts, add "processors * * *
grid numa"_processors.html" to the beginning of the input script for
better scalability.
When running on many nodes, performance might be better when using
fewer OpenMP threads and more MPI tasks. This will depend on the
simulation and the machine. Using the "verlet/split"_run_style.html

View File

@ -14,7 +14,7 @@ read_data file keyword args ... :pre
file = name of data file to read in :ulb,l
zero or more keyword/arg pairs may be appended :l
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
{add} arg = {append} or {Nstart} or {merge}
append = add new atoms with IDs appended to current IDs
Nstart = add new atoms with IDs starting with Nstart

View File

@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x;
@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x;
@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x;
@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x;

View File

@ -172,6 +172,10 @@ class IntelBuffers {
inline void thr_pack(const int ifrom, const int ito, const int ago) {
if (ago == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = lmp->atom->x[i][0];
_x[i].y = lmp->atom->x[i][1];
@ -179,9 +183,17 @@ class IntelBuffers {
_x[i].w = lmp->atom->type[i];
}
if (lmp->atom->q != NULL)
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++)
_q[i] = lmp->atom->q[i];
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = lmp->atom->x[i][0];
_x[i].y = lmp->atom->x[i][1];
@ -204,7 +216,10 @@ class IntelBuffers {
const int offset, const bool dotype = false) {
double ** x = lmp->atom->x + offset;
if (dotype == false) {
#pragma vector nontemporal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = x[i][0];
_x[i].y = x[i][1];
@ -212,7 +227,10 @@ class IntelBuffers {
}
} else {
int *type = lmp->atom->type + offset;
#pragma vector nontemporal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = x[i][0];
_x[i].y = x[i][1];
@ -225,6 +243,10 @@ class IntelBuffers {
inline void thr_pack_host(const int ifrom, const int ito,
const int offset) {
double ** x = lmp->atom->x + offset;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_host_x[i].x = x[i][0];
_host_x[i].y = x[i][1];

View File

@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#define INTEL_MAX_STENCIL 256
// INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
#define INTEL_MAX_STENCIL_CHECK 4096
#define INTEL_P3M_MAXORDER 7
#define INTEL_P3M_MAXORDER 8
#define INTEL_P3M_ALIGNED_MAXORDER 8
// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
#define INTEL_P3M_TABLE 1
@ -248,6 +248,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#else
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
{ \
ifrom = 0; \
ito = inum; \
}
#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
{ \
tid = 0; \
@ -293,6 +299,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
ito = inum; \
}
#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \
nthreads, vecsize) \
{ \
tid = 0; \
ifrom = 0; \
ito = inum; \
ip = vecsize; \
}
#endif
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \

View File

@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * q[i];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
my_density[mzyx] += x0*rho[0][l];
}
@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * B[type];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
my_density[mzyx] += x0*rho[0][l];
}
@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
FFT_SCALAR w = x0*rho[0][l];
density_brick_a0[mz][my][mx] += w*B[7*type];
@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
FFT_SCALAR w0 = x0*rho[0][l];
for(int k = 0; k < nsplit; k++)
@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
@ -1490,8 +1495,7 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekx, eky, ekz;
ekx = eky = ekz = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m + nysum;
@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
@ -1837,8 +1843,7 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekx, eky, ekz;
ekx = eky = ekz = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx];
@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
@ -2221,7 +2228,7 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx5 = eky5 = ekz5 = ZEROF;
ekx6 = eky6 = ekz6 = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
ekx0 += ekx0_arr[l];
eky0 += eky0_arr[l];
ekz0 += ekz0_arr[l];
@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
FFT_SCALAR x0 = drho[0][l] * ekx_p;
FFT_SCALAR y0 = rho[0][l] * eky_p;
@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx0[i] += ekx0[l];
particle_eky0[i] += eky0[l];
particle_ekz0[i] += ekz0[l];
@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
for (int k = 0; k < nsplit; k++) {
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
@ -2716,7 +2725,7 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx[k] = eky[k] = ekz[k] = ZEROF;
}
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
for (int k = 0; k < nsplit; k++) {
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
for (int k = 0; k < nsplit; k++) {
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
u_brick_none[k][mz][my][mx];
@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
for (int k = 0; k < nsplit; k++) {
ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];

View File

@ -149,11 +149,11 @@ void PPPMIntel::init()
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdxy_brick");
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdz0_brick");
memory->destroy(work3);
@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * q[i];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int mzy = m*nix + mz;
@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m+nysum;
@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx = eky = ekz = ZEROF;
if (use_packing) {
for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
for (int l = 0; l < 2*order; l += 2) {
ekx += ekxy_arr[l];
eky += ekxy_arr[l+1];
ekz += ekz0_arr[l];
}
} else {
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m + nysum;
@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
fix->stop_watch(TIME_PACK);
}
/* ----------------------------------------------------------------------
Allocate density_brick with extra padding for vector writes
------------------------------------------------------------------------- */
void PPPMIntel::allocate()
{
PPPM::allocate();
memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick");
if (differentiation_flag == 1) {
memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:u_brick");
} else {
memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vdx_brick");
create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vdy_brick");
create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:vdz_brick");
}
}
/* ----------------------------------------------------------------------
Create 3D-offset allocation with extra padding for vector writes
------------------------------------------------------------------------- */
FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
int n1hi, int n2lo, int n2hi,
int n3lo, int n3hi,
const char *name)
{
int n1 = n1hi - n1lo + 1;
int n2 = n2hi - n2lo + 1;
int n3 = n3hi - n3lo + 1;
bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * n1*n2*n3 +
INTEL_P3M_ALIGNED_MAXORDER*2;
FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);
nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);
nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);
bigint m;
bigint n = 0;
for (int i = 0; i < n1; i++) {
m = ((bigint) i) * n2;
array[i] = &plane[m];
for (int j = 0; j < n2; j++) {
plane[m+j] = &data[n];
n += n3;
}
}
m = ((bigint) n1) * n2;
for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
for (int i = 0; i < n1; i++) array[i] -= n2lo;
array -= n1lo;
return array;
}
/* ----------------------------------------------------------------------
Returns 0 if Intel optimizations for PPPM ignored due to offload
------------------------------------------------------------------------- */

View File

@ -74,10 +74,11 @@ class PPPMIntel : public PPPM {
int _use_base;
#endif
virtual void allocate();
template<class flt_t, class acc_t>
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
void precompute_rho();
template<class flt_t, class acc_t>
void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
fieldforce_ad<flt_t,acc_t,0>(buffers);
}
}
FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
int, int, int, const char *name);
};
}