forked from lijiext/lammps
Merge pull request #558 from lammps/intel
memory allocation bugfix for USER-INTEL pppm from M Brown
This commit is contained in:
commit
5c0c8bb4cd
|
@ -2,14 +2,18 @@
|
|||
|
||||
_Briefly describe the new feature(s), enhancement(s), or bugfix(es) included in this pull request. If this addresses an open GitHub Issue, mention the issue number, e.g. with `fixes #221` or `closes #135`, so that issue will be automatically closed when the pull request is merged_
|
||||
|
||||
## Implementation Notes
|
||||
## Author(s)
|
||||
|
||||
_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
|
||||
_Please state name and affiliation of the author or authors that should be credited with the changes in this pull request_
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
_Please state whether any changes in the pull request break backward compatibility for inputs, and - if yes - explain what has been changed and why_
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
_Provide any relevant details about how the changes are implemented, how correctness was verified, how other features - if any - in LAMMPS are affected_
|
||||
|
||||
## Post Submission Checklist
|
||||
|
||||
_Please check the fields below as they are completed_
|
||||
|
|
|
@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
|
|||
For some of the simple 2-body potentials without long-range
|
||||
electrostatics, performance and scalability can be better with
|
||||
the "newton off" setting added to the input script :l
|
||||
For simulations on higher node counts, add "processors * * * grid
|
||||
numa" to the beginning of the input script for better scalability :l
|
||||
If using {kspace_style pppm} in the input script, add
|
||||
"kspace_modify diff ad" for better performance :l
|
||||
:ule
|
||||
|
@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
|
|||
input script to automatically append the "omp" suffix to styles when
|
||||
USER-INTEL styles are not available.
|
||||
|
||||
NOTE: For simulations on higher node counts, add "processors * * *
|
||||
grid numa"_processors.html" to the beginning of the input script for
|
||||
better scalability.
|
||||
|
||||
When running on many nodes, performance might be better when using
|
||||
fewer OpenMP threads and more MPI tasks. This will depend on the
|
||||
simulation and the machine. Using the "verlet/split"_run_style.html
|
||||
|
|
|
@ -14,7 +14,7 @@ read_data file keyword args ... :pre
|
|||
|
||||
file = name of data file to read in :ulb,l
|
||||
zero or more keyword/arg pairs may be appended :l
|
||||
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
|
||||
keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
|
||||
{add} arg = {append} or {Nstart} or {merge}
|
||||
append = add new atoms with IDs appended to current IDs
|
||||
Nstart = add new atoms with IDs starting with Nstart
|
||||
|
|
|
@ -32,7 +32,7 @@ bond_coeff 1 10.0 1.2
|
|||
# need to preserve 1-3, 1-4 pairwise interactions during hard collisions
|
||||
|
||||
special_bonds lj/coul 0 1 1
|
||||
create_bonds all all 1 1.0 1.5
|
||||
create_bonds many all all 1 1.0 1.5
|
||||
|
||||
neighbor 0.3 bin
|
||||
neigh_modify delay 0 every 1 check yes
|
||||
|
|
|
@ -32,7 +32,7 @@ pair_coeff 1 1 10.0 1.0 2.5
|
|||
bond_style harmonic
|
||||
bond_coeff 1 10.0 1.2
|
||||
|
||||
create_bonds all all 1 1.0 1.5
|
||||
create_bonds many all all 1 1.0 1.5
|
||||
|
||||
neighbor 0.3 bin
|
||||
neigh_modify delay 0 every 1 check yes
|
||||
|
|
|
@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
|
@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[ii].x;
|
||||
|
@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||
if (eatom) {
|
||||
double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
|
@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
|
|||
}
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma novector
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
f[i].x += f_in[i].x;
|
||||
|
|
|
@ -172,6 +172,10 @@ class IntelBuffers {
|
|||
|
||||
inline void thr_pack(const int ifrom, const int ito, const int ago) {
|
||||
if (ago == 0) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
_x[i].x = lmp->atom->x[i][0];
|
||||
_x[i].y = lmp->atom->x[i][1];
|
||||
|
@ -179,9 +183,17 @@ class IntelBuffers {
|
|||
_x[i].w = lmp->atom->type[i];
|
||||
}
|
||||
if (lmp->atom->q != NULL)
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++)
|
||||
_q[i] = lmp->atom->q[i];
|
||||
} else {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
_x[i].x = lmp->atom->x[i][0];
|
||||
_x[i].y = lmp->atom->x[i][1];
|
||||
|
@ -204,7 +216,10 @@ class IntelBuffers {
|
|||
const int offset, const bool dotype = false) {
|
||||
double ** x = lmp->atom->x + offset;
|
||||
if (dotype == false) {
|
||||
#pragma vector nontemporal
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
_x[i].x = x[i][0];
|
||||
_x[i].y = x[i][1];
|
||||
|
@ -212,7 +227,10 @@ class IntelBuffers {
|
|||
}
|
||||
} else {
|
||||
int *type = lmp->atom->type + offset;
|
||||
#pragma vector nontemporal
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
_x[i].x = x[i][0];
|
||||
_x[i].y = x[i][1];
|
||||
|
@ -225,6 +243,10 @@ class IntelBuffers {
|
|||
inline void thr_pack_host(const int ifrom, const int ito,
|
||||
const int offset) {
|
||||
double ** x = lmp->atom->x + offset;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma vector aligned
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i = ifrom; i < ito; i++) {
|
||||
_host_x[i].x = x[i][0];
|
||||
_host_x[i].y = x[i][1];
|
||||
|
|
|
@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
|
|||
#define INTEL_MAX_STENCIL 256
|
||||
// INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
|
||||
#define INTEL_MAX_STENCIL_CHECK 4096
|
||||
#define INTEL_P3M_MAXORDER 7
|
||||
#define INTEL_P3M_MAXORDER 8
|
||||
#define INTEL_P3M_ALIGNED_MAXORDER 8
|
||||
// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
|
||||
#define INTEL_P3M_TABLE 1
|
||||
|
|
|
@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR z0 = fdelvolinv * q[i];
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n*nix*niy + nzsum;
|
||||
FFT_SCALAR y0 = z0*rho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int mzy = m*nix + mz;
|
||||
FFT_SCALAR x0 = y0*rho[1][m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mzyx = l + mzy;
|
||||
my_density[mzyx] += x0*rho[0][l];
|
||||
}
|
||||
|
@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR z0 = fdelvolinv * B[type];
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n*nix*niy + nzsum;
|
||||
FFT_SCALAR y0 = z0*rho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int mzy = m*nix + mz;
|
||||
FFT_SCALAR x0 = y0*rho[1][m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mzyx = l + mzy;
|
||||
my_density[mzyx] += x0*rho[0][l];
|
||||
}
|
||||
|
@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR z0 = fdelvolinv;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n + nzsum;
|
||||
FFT_SCALAR y0 = z0*rho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m + nysum;
|
||||
FFT_SCALAR x0 = y0*rho[1][m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l + nxsum;
|
||||
FFT_SCALAR w = x0*rho[0][l];
|
||||
density_brick_a0[mz][my][mx] += w*B[7*type];
|
||||
|
@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR z0 = fdelvolinv;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n*nix*niy + nzsum;
|
||||
FFT_SCALAR y0 = z0*rho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int mzy = m*nix + mz;
|
||||
FFT_SCALAR x0 = y0*rho[1][m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mzyx = l + mzy;
|
||||
FFT_SCALAR w0 = x0*rho[0][l];
|
||||
for(int k = 0; k < nsplit; k++)
|
||||
|
@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n+nzsum;
|
||||
FFT_SCALAR z0 = rho2[n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int my = m+nysum;
|
||||
FFT_SCALAR y0 = z0*rho1[m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l+nxsum;
|
||||
FFT_SCALAR x0 = y0*rho0[l];
|
||||
ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
|
||||
|
@ -1490,12 +1495,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR ekx, eky, ekz;
|
||||
ekx = eky = ekz = ZEROF;
|
||||
|
||||
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
ekx += ekx_arr[l];
|
||||
eky += eky_arr[l];
|
||||
ekz += ekz_arr[l];
|
||||
}
|
||||
for (int l = 0; l < order; l++) {
|
||||
ekx += ekx_arr[l];
|
||||
eky += eky_arr[l];
|
||||
ekz += ekz_arr[l];
|
||||
}
|
||||
|
||||
// convert E-field to force
|
||||
|
||||
|
@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
|
||||
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n + nzsum;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int my = m + nysum;
|
||||
|
@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l + nxsum;
|
||||
ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
|
||||
eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
|
||||
|
@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma simd
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||
for (int l = 0; l < order; l++){
|
||||
particle_ekx[i] += ekx[l];
|
||||
particle_eky[i] += eky[l];
|
||||
particle_ekz[i] += ekz[l];
|
||||
|
@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
_alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n+nzsum;
|
||||
FFT_SCALAR z0 = rho2[n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m+nysum;
|
||||
FFT_SCALAR y0 = z0*rho1[m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l+nxsum;
|
||||
FFT_SCALAR x0 = y0*rho0[l];
|
||||
ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
|
||||
|
@ -1837,12 +1843,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR ekx, eky, ekz;
|
||||
ekx = eky = ekz = ZEROF;
|
||||
|
||||
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
ekx += ekx_arr[l];
|
||||
eky += eky_arr[l];
|
||||
ekz += ekz_arr[l];
|
||||
}
|
||||
for (int l = 0; l < order; l++) {
|
||||
ekx += ekx_arr[l];
|
||||
eky += eky_arr[l];
|
||||
ekz += ekz_arr[l];
|
||||
}
|
||||
|
||||
// convert E-field to force
|
||||
|
||||
|
@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
|
||||
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n + nzsum;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m + nysum;
|
||||
|
@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l + nxsum;
|
||||
ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
|
||||
eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx];
|
||||
|
@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma simd
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||
for (int l = 0; l < order; l++){
|
||||
particle_ekx[i] += ekx[l];
|
||||
particle_eky[i] += eky[l];
|
||||
particle_ekz[i] += ekz[l];
|
||||
|
@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n+nzsum;
|
||||
FFT_SCALAR z0 = rho2[n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m+nysum;
|
||||
FFT_SCALAR y0 = z0*rho1[m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l+nxsum;
|
||||
FFT_SCALAR x0 = y0*rho0[l];
|
||||
ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
|
||||
|
@ -2221,29 +2228,29 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
ekx5 = eky5 = ekz5 = ZEROF;
|
||||
ekx6 = eky6 = ekz6 = ZEROF;
|
||||
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
ekx0 += ekx0_arr[l];
|
||||
eky0 += eky0_arr[l];
|
||||
ekz0 += ekz0_arr[l];
|
||||
ekx1 += ekx1_arr[l];
|
||||
eky1 += eky1_arr[l];
|
||||
ekz1 += ekz1_arr[l];
|
||||
ekx2 += ekx2_arr[l];
|
||||
eky2 += eky2_arr[l];
|
||||
ekz2 += ekz2_arr[l];
|
||||
ekx3 += ekx3_arr[l];
|
||||
eky3 += eky3_arr[l];
|
||||
ekz3 += ekz3_arr[l];
|
||||
ekx4 += ekx4_arr[l];
|
||||
eky4 += eky4_arr[l];
|
||||
ekz4 += ekz4_arr[l];
|
||||
ekx5 += ekx5_arr[l];
|
||||
eky5 += eky5_arr[l];
|
||||
ekz5 += ekz5_arr[l];
|
||||
ekx6 += ekx6_arr[l];
|
||||
eky6 += eky6_arr[l];
|
||||
ekz6 += ekz6_arr[l];
|
||||
}
|
||||
for (int l = 0; l < order; l++) {
|
||||
ekx0 += ekx0_arr[l];
|
||||
eky0 += eky0_arr[l];
|
||||
ekz0 += ekz0_arr[l];
|
||||
ekx1 += ekx1_arr[l];
|
||||
eky1 += eky1_arr[l];
|
||||
ekz1 += ekz1_arr[l];
|
||||
ekx2 += ekx2_arr[l];
|
||||
eky2 += eky2_arr[l];
|
||||
ekz2 += ekz2_arr[l];
|
||||
ekx3 += ekx3_arr[l];
|
||||
eky3 += eky3_arr[l];
|
||||
ekz3 += ekz3_arr[l];
|
||||
ekx4 += ekx4_arr[l];
|
||||
eky4 += eky4_arr[l];
|
||||
ekz4 += ekz4_arr[l];
|
||||
ekx5 += ekx5_arr[l];
|
||||
eky5 += eky5_arr[l];
|
||||
ekz5 += ekz5_arr[l];
|
||||
ekx6 += ekx6_arr[l];
|
||||
eky6 += eky6_arr[l];
|
||||
ekz6 += ekz6_arr[l];
|
||||
}
|
||||
|
||||
// convert D-field to force
|
||||
|
||||
|
@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n + nzsum;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m + nysum;
|
||||
|
@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l + nxsum;
|
||||
FFT_SCALAR x0 = drho[0][l] * ekx_p;
|
||||
FFT_SCALAR y0 = rho[0][l] * eky_p;
|
||||
|
@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma simd
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||
for (int l = 0; l < order; l++){
|
||||
particle_ekx0[i] += ekx0[l];
|
||||
particle_eky0[i] += eky0[l];
|
||||
particle_ekz0[i] += ekz0[l];
|
||||
|
@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
|
||||
for (int k = 0; k < nsplit; k++) {
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n+nzsum;
|
||||
FFT_SCALAR z0 = rho2[n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m+nysum;
|
||||
FFT_SCALAR y0 = z0*rho1[m];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l+nxsum;
|
||||
FFT_SCALAR x0 = y0*rho0[l];
|
||||
ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
|
||||
|
@ -2716,13 +2725,13 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
ekx[k] = eky[k] = ekz[k] = ZEROF;
|
||||
}
|
||||
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int k = 0; k < nsplit; k++) {
|
||||
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
}
|
||||
}
|
||||
for (int l = 0; l < order; l++) {
|
||||
for (int k = 0; k < nsplit; k++) {
|
||||
ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
|
||||
}
|
||||
}
|
||||
|
||||
// convert E-field to force
|
||||
|
||||
|
@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
for (int k = 0; k < nsplit; k++) {
|
||||
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order_6; n++) {
|
||||
int mz = n + nzsum;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order_6; m++) {
|
||||
int my = m + nysum;
|
||||
|
@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
|
||||
FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#pragma simd
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
int mx = l + nxsum;
|
||||
ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
|
||||
u_brick_none[k][mz][my][mx];
|
||||
|
@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma simd
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||
for (int l = 0; l < order; l++){
|
||||
for (int k = 0; k < nsplit; k++) {
|
||||
ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
|
||||
eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
|
||||
|
|
|
@ -149,13 +149,13 @@ void PPPMIntel::init()
|
|||
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
||||
memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
|
||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||
"pppmintel:vdxy_brick");
|
||||
create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
|
||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||
"pppmintel:vdxy_brick");
|
||||
memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
|
||||
memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
|
||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||
"pppmintel:vdz0_brick");
|
||||
create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
|
||||
nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
|
||||
"pppmintel:vdz0_brick");
|
||||
memory->destroy(work3);
|
||||
memory->create(work3, 2*nfft_both, "pppmintel:work3");
|
||||
|
||||
|
@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
FFT_SCALAR z0 = fdelvolinv * q[i];
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n*nix*niy + nzsum;
|
||||
FFT_SCALAR y0 = z0*rho[2][n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int mzy = m*nix + mz;
|
||||
|
@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
_alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n+nzsum;
|
||||
FFT_SCALAR z0 = rho2[n];
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int my = m+nysum;
|
||||
|
@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
ekx = eky = ekz = ZEROF;
|
||||
|
||||
if (use_packing) {
|
||||
for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
|
||||
for (int l = 0; l < 2*order; l += 2) {
|
||||
ekx += ekxy_arr[l];
|
||||
eky += ekxy_arr[l+1];
|
||||
ekz += ekz0_arr[l];
|
||||
}
|
||||
} else {
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
|
||||
for (int l = 0; l < order; l++) {
|
||||
ekx += ekx_arr[l];
|
||||
eky += eky_arr[l];
|
||||
ekz += ekz_arr[l];
|
||||
|
@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int n = 0; n < order; n++) {
|
||||
int mz = n + nzsum;
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma loop_count=7
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int m = 0; m < order; m++) {
|
||||
int my = m + nysum;
|
||||
|
@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
|
|||
}
|
||||
|
||||
#if defined(LMP_SIMD_COMPILER)
|
||||
#pragma simd
|
||||
#pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
|
||||
#endif
|
||||
for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
|
||||
for (int l = 0; l < order; l++){
|
||||
particle_ekx[i] += ekx[l];
|
||||
particle_eky[i] += eky[l];
|
||||
particle_ekz[i] += ekz[l];
|
||||
|
@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
|
|||
fix->stop_watch(TIME_PACK);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Allocate density_brick with extra padding for vector writes
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PPPMIntel::allocate()
|
||||
{
|
||||
PPPM::allocate();
|
||||
memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
|
||||
nxlo_out,nxhi_out,"pppm:density_brick");
|
||||
|
||||
if (differentiation_flag == 1) {
|
||||
memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
|
||||
nxlo_out,nxhi_out,"pppm:u_brick");
|
||||
} else {
|
||||
memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
|
||||
create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
|
||||
nxlo_out,nxhi_out,"pppm:vdx_brick");
|
||||
create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
|
||||
nxlo_out,nxhi_out,"pppm:vdy_brick");
|
||||
create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
|
||||
nxlo_out,nxhi_out,"pppm:vdz_brick");
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Create 3D-offset allocation with extra padding for vector writes
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo,
|
||||
int n1hi, int n2lo, int n2hi,
|
||||
int n3lo, int n3hi,
|
||||
const char *name)
|
||||
{
|
||||
int n1 = n1hi - n1lo + 1;
|
||||
int n2 = n2hi - n2lo + 1;
|
||||
int n3 = n3hi - n3lo + 1;
|
||||
|
||||
bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * n1*n2*n3 +
|
||||
INTEL_P3M_ALIGNED_MAXORDER*2;
|
||||
FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);
|
||||
nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
|
||||
FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);
|
||||
nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
|
||||
array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);
|
||||
|
||||
bigint m;
|
||||
bigint n = 0;
|
||||
for (int i = 0; i < n1; i++) {
|
||||
m = ((bigint) i) * n2;
|
||||
array[i] = &plane[m];
|
||||
for (int j = 0; j < n2; j++) {
|
||||
plane[m+j] = &data[n];
|
||||
n += n3;
|
||||
}
|
||||
}
|
||||
|
||||
m = ((bigint) n1) * n2;
|
||||
for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
|
||||
for (int i = 0; i < n1; i++) array[i] -= n2lo;
|
||||
array -= n1lo;
|
||||
return array;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Returns 0 if Intel optimizations for PPPM ignored due to offload
|
||||
------------------------------------------------------------------------- */
|
||||
|
|
|
@ -74,9 +74,10 @@ class PPPMIntel : public PPPM {
|
|||
int _use_base;
|
||||
#endif
|
||||
|
||||
template<class flt_t, class acc_t>
|
||||
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
virtual void allocate();
|
||||
|
||||
template<class flt_t, class acc_t>
|
||||
void test_function(IntelBuffers<flt_t,acc_t> *buffers);
|
||||
|
||||
void precompute_rho();
|
||||
template<class flt_t, class acc_t>
|
||||
|
@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
|
|||
fieldforce_ad<flt_t,acc_t,0>(buffers);
|
||||
}
|
||||
}
|
||||
FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
|
||||
int, int, int, const char *name);
|
||||
};
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue