Merge pull request #558 from lammps/intel

memory allocation bugfix for USER-INTEL pppm from M Brown
This commit is contained in:
sjplimp 2017-07-03 12:25:12 -06:00 committed by GitHub
commit 5c0c8bb4cd
11 changed files with 232 additions and 116 deletions

View File

@ -2,14 +2,18 @@
_Briefly describe the new feature(s), enhancement(s), or bugfix(es) included in this pull request. If this addresses an open GitHub Issue, mention the issue number, e.g. with `fixes #221` or `closes #135`, so that issue will be automatically closed when the pull request is merged_
## Implementation Notes
## Author(s)
_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
_Please state name and affiliation of the author or authors that should be credited with the changes in this pull request_
## Backward Compatibility
_Please state whether any changes in the pull request break backward compatibility for inputs, and - if yes - explain what has been changed and why_
## Implementation Notes
_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
## Post Submission Checklist
_Please check the fields below as they are completed_

View File

@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
For some of the simple 2-body potentials without long-range
electrostatics, performance and scalability can be better with
the "newton off" setting added to the input script :l
For simulations on higher node counts, add "processors * * * grid
numa" to the beginning of the input script for better scalability :l
If using {kspace_style pppm} in the input script, add
"kspace_modify diff ad" for better performance :l
:ule
@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
input script to automatically append the "omp" suffix to styles when
USER-INTEL styles are not available.
NOTE: For simulations on higher node counts, add "processors * * *
grid numa"_processors.html to the beginning of the input script for
better scalability.
When running on many nodes, performance might be better when using
fewer OpenMP threads and more MPI tasks. This will depend on the
simulation and the machine. Using the "verlet/split"_run_style.html

View File

@ -14,7 +14,7 @@ read_data file keyword args ... :pre
file = name of data file to read in :ulb,l
zero or more keyword/arg pairs may be appended :l
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
{add} arg = {append} or {Nstart} or {merge}
append = add new atoms with IDs appended to current IDs
Nstart = add new atoms with IDs starting with Nstart

View File

@ -32,7 +32,7 @@ bond_coeff 1 10.0 1.2
# need to preserve 1-3, 1-4 pairwise interactions during hard collisions
special_bonds lj/coul 0 1 1
create_bonds all all 1 1.0 1.5
create_bonds many all all 1 1.0 1.5
neighbor 0.3 bin
neigh_modify delay 0 every 1 check yes

View File

@ -32,7 +32,7 @@ pair_coeff 1 1 10.0 1.0 2.5
bond_style harmonic
bond_coeff 1 10.0 1.2
create_bonds all all 1 1.0 1.5
create_bonds many all all 1 1.0 1.5
neighbor 0.3 bin
neigh_modify delay 0 every 1 check yes

View File

@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x;
@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[ii].x;
@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
if (eatom) {
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x;
@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma novector
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
f[i].x += f_in[i].x;

View File

@ -172,6 +172,10 @@ class IntelBuffers {
inline void thr_pack(const int ifrom, const int ito, const int ago) {
if (ago == 0) {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = lmp->atom->x[i][0];
_x[i].y = lmp->atom->x[i][1];
@ -179,9 +183,17 @@ class IntelBuffers {
_x[i].w = lmp->atom->type[i];
}
if (lmp->atom->q != NULL)
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++)
_q[i] = lmp->atom->q[i];
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = lmp->atom->x[i][0];
_x[i].y = lmp->atom->x[i][1];
@ -204,7 +216,10 @@ class IntelBuffers {
const int offset, const bool dotype = false) {
double ** x = lmp->atom->x + offset;
if (dotype == false) {
#pragma vector nontemporal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = x[i][0];
_x[i].y = x[i][1];
@ -212,7 +227,10 @@ class IntelBuffers {
}
} else {
int *type = lmp->atom->type + offset;
#pragma vector nontemporal
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_x[i].x = x[i][0];
_x[i].y = x[i][1];
@ -225,6 +243,10 @@ class IntelBuffers {
inline void thr_pack_host(const int ifrom, const int ito,
const int offset) {
double ** x = lmp->atom->x + offset;
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int i = ifrom; i < ito; i++) {
_host_x[i].x = x[i][0];
_host_x[i].y = x[i][1];

View File

@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
#define INTEL_MAX_STENCIL 256
// INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
#define INTEL_MAX_STENCIL_CHECK 4096
#define INTEL_P3M_MAXORDER 7
#define INTEL_P3M_MAXORDER 8
#define INTEL_P3M_ALIGNED_MAXORDER 8
// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
#define INTEL_P3M_TABLE 1

View File

@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * q[i];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
my_density[mzyx] += x0*rho[0][l];
}
@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * B[type];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
my_density[mzyx] += x0*rho[0][l];
}
@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
FFT_SCALAR w = x0*rho[0][l];
density_brick_a0[mz][my][mx] += w*B[7*type];
@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int mzy = m*nix + mz;
FFT_SCALAR x0 = y0*rho[1][m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mzyx = l + mzy;
FFT_SCALAR w0 = x0*rho[0][l];
for(int k = 0; k < nsplit; k++)
@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
@ -1490,12 +1495,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekx, eky, ekz;
ekx = eky = ekz = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
// convert E-field to force
@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m + nysum;
@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
@ -1837,12 +1843,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR ekx, eky, ekz;
ekx = eky = ekz = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
}
// convert E-field to force
@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx];
@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
@ -2221,29 +2228,29 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx5 = eky5 = ekz5 = ZEROF;
ekx6 = eky6 = ekz6 = ZEROF;
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
ekx0 += ekx0_arr[l];
eky0 += eky0_arr[l];
ekz0 += ekz0_arr[l];
ekx1 += ekx1_arr[l];
eky1 += eky1_arr[l];
ekz1 += ekz1_arr[l];
ekx2 += ekx2_arr[l];
eky2 += eky2_arr[l];
ekz2 += ekz2_arr[l];
ekx3 += ekx3_arr[l];
eky3 += eky3_arr[l];
ekz3 += ekz3_arr[l];
ekx4 += ekx4_arr[l];
eky4 += eky4_arr[l];
ekz4 += ekz4_arr[l];
ekx5 += ekx5_arr[l];
eky5 += eky5_arr[l];
ekz5 += ekz5_arr[l];
ekx6 += ekx6_arr[l];
eky6 += eky6_arr[l];
ekz6 += ekz6_arr[l];
}
for (int l = 0; l < order; l++) {
ekx0 += ekx0_arr[l];
eky0 += eky0_arr[l];
ekz0 += ekz0_arr[l];
ekx1 += ekx1_arr[l];
eky1 += eky1_arr[l];
ekz1 += ekz1_arr[l];
ekx2 += ekx2_arr[l];
eky2 += eky2_arr[l];
ekz2 += ekz2_arr[l];
ekx3 += ekx3_arr[l];
eky3 += eky3_arr[l];
ekz3 += ekz3_arr[l];
ekx4 += ekx4_arr[l];
eky4 += eky4_arr[l];
ekz4 += ekz4_arr[l];
ekx5 += ekx5_arr[l];
eky5 += eky5_arr[l];
ekz5 += ekz5_arr[l];
ekx6 += ekx6_arr[l];
eky6 += eky6_arr[l];
ekz6 += ekz6_arr[l];
}
// convert D-field to force
@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
FFT_SCALAR x0 = drho[0][l] * ekx_p;
FFT_SCALAR y0 = rho[0][l] * eky_p;
@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx0[i] += ekx0[l];
particle_eky0[i] += eky0[l];
particle_ekz0[i] += ekz0[l];
@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
for (int k = 0; k < nsplit; k++) {
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m+nysum;
FFT_SCALAR y0 = z0*rho1[m];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l+nxsum;
FFT_SCALAR x0 = y0*rho0[l];
ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
@ -2716,13 +2725,13 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx[k] = eky[k] = ekz[k] = ZEROF;
}
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int k = 0; k < nsplit; k++) {
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
}
}
for (int l = 0; l < order; l++) {
for (int k = 0; k < nsplit; k++) {
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
}
}
// convert E-field to force
@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
for (int k = 0; k < nsplit; k++) {
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order_6; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order_6; m++) {
int my = m + nysum;
@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#pragma simd
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
int mx = l + nxsum;
ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
u_brick_none[k][mz][my][mx];
@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
for (int k = 0; k < nsplit; k++) {
ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];

View File

@ -149,13 +149,13 @@ void PPPMIntel::init()
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdxy_brick");
create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdxy_brick");
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdz0_brick");
create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
"pppmintel:vdz0_brick");
memory->destroy(work3);
memory->create(work3, 2*nfft_both, "pppmintel:work3");
@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR z0 = fdelvolinv * q[i];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n*nix*niy + nzsum;
FFT_SCALAR y0 = z0*rho[2][n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int mzy = m*nix + mz;
@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
_alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n+nzsum;
FFT_SCALAR z0 = rho2[n];
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m+nysum;
@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
ekx = eky = ekz = ZEROF;
if (use_packing) {
for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
for (int l = 0; l < 2*order; l += 2) {
ekx += ekxy_arr[l];
eky += ekxy_arr[l+1];
ekz += ekz0_arr[l];
}
} else {
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
for (int l = 0; l < order; l++) {
ekx += ekx_arr[l];
eky += eky_arr[l];
ekz += ekz_arr[l];
@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int n = 0; n < order; n++) {
int mz = n + nzsum;
#if defined(LMP_SIMD_COMPILER)
#pragma loop_count=7
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int m = 0; m < order; m++) {
int my = m + nysum;
@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
}
#if defined(LMP_SIMD_COMPILER)
#pragma simd
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
#endif
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
for (int l = 0; l < order; l++){
particle_ekx[i] += ekx[l];
particle_eky[i] += eky[l];
particle_ekz[i] += ekz[l];
@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
fix->stop_watch(TIME_PACK);
}
/* ----------------------------------------------------------------------
   Run the base PPPM allocation, then replace density_brick and the
   bricks selected by differentiation_flag with arrays obtained from
   PPPMIntel::create3d_offset()
------------------------------------------------------------------------- */
void PPPMIntel::allocate()
{
  // let the base class set everything up first
  PPPM::allocate();

  // the charge-density brick is always swapped out
  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
  create3d_offset(density_brick,
                  nzlo_out,nzhi_out,
                  nylo_out,nyhi_out,
                  nxlo_out,nxhi_out,
                  "pppm:density_brick");

  // differentiation_flag == 1: only u_brick is re-created
  if (differentiation_flag == 1) {
    memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
    create3d_offset(u_brick,
                    nzlo_out,nzhi_out,
                    nylo_out,nyhi_out,
                    nxlo_out,nxhi_out,
                    "pppm:u_brick");
    return;
  }

  // otherwise: release all three vd*_brick arrays, then re-create them
  memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
  memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);

  create3d_offset(vdx_brick,
                  nzlo_out,nzhi_out,
                  nylo_out,nyhi_out,
                  nxlo_out,nxhi_out,
                  "pppm:vdx_brick");
  create3d_offset(vdy_brick,
                  nzlo_out,nzhi_out,
                  nylo_out,nyhi_out,
                  nxlo_out,nxhi_out,
                  "pppm:vdy_brick");
  create3d_offset(vdz_brick,
                  nzlo_out,nzhi_out,
                  nylo_out,nyhi_out,
                  nxlo_out,nxhi_out,
                  "pppm:vdz_brick");
}
/* ----------------------------------------------------------------------
   Create 3D-offset allocation with extra padding for vector writes

   array        reference that receives the offset-adjusted 3d pointer
   n1lo,n1hi    inclusive index bounds of the 1st (slowest) dimension
   n2lo,n2hi    inclusive index bounds of the 2nd dimension
   n3lo,n3hi    inclusive index bounds of the 3rd (fastest) dimension
   name         label passed to memory->smalloc() for each allocation

   Three separate buffers are allocated: the contiguous data, a table of
   row pointers (plane), and a table of plane pointers (array).  The
   data buffer holds n1*n2*n3 values plus 2*INTEL_P3M_ALIGNED_MAXORDER
   trailing FFT_SCALAR elements of slack.

   Returns the same pointer that is stored in array.
------------------------------------------------------------------------- */
FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
                                          int n1hi, int n2lo, int n2hi,
                                          int n3lo, int n3hi,
                                          const char *name)
{
  int n1 = n1hi - n1lo + 1;
  int n2 = n2hi - n2lo + 1;
  int n3 = n3hi - n3lo + 1;

  // Size the padding in ELEMENTS, then convert to bytes.  Previously the
  // pad constant was added to the byte count, which yields only 16 bytes
  // of slack regardless of sizeof(FFT_SCALAR).
  // NOTE(review): assumes the vectorized writers may touch up to
  // INTEL_P3M_ALIGNED_MAXORDER elements past a row start - confirm.
  // The count is accumulated in bigint so n1*n2*n3 cannot overflow int.
  bigint nelems = ((bigint) n1) * n2 * n3 + INTEL_P3M_ALIGNED_MAXORDER*2;
  bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * nelems;
  FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);

  nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
  FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);

  nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
  array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);

  // wire up the pointer tables: array[i] -> row-pointer slice,
  // plane[m+j] -> start of row (i,j) in the contiguous data
  bigint m;
  bigint n = 0;
  for (int i = 0; i < n1; i++) {
    m = ((bigint) i) * n2;
    array[i] = &plane[m];
    for (int j = 0; j < n2; j++) {
      plane[m+j] = &data[n];
      n += n3;
    }
  }

  // shift every level by its lower bound so that
  // array[n1lo][n2lo][n3lo] addresses the first data element
  m = ((bigint) n1) * n2;
  for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
  for (int i = 0; i < n1; i++) array[i] -= n2lo;
  array -= n1lo;
  return array;
}
/* ----------------------------------------------------------------------
Returns 0 if Intel optimizations for PPPM ignored due to offload
------------------------------------------------------------------------- */

View File

@ -74,9 +74,10 @@ class PPPMIntel : public PPPM {
int _use_base;
#endif
template<class flt_t, class acc_t>
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
virtual void allocate();
template<class flt_t, class acc_t>
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
void precompute_rho();
template<class flt_t, class acc_t>
@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
fieldforce_ad<flt_t,acc_t,0>(buffers);
}
}
FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
int, int, int, const char *name);
};
}