git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12406 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2014-09-05 14:42:44 +00:00
parent f6c0d4fec3
commit 777e82995d
35 changed files with 340 additions and 375 deletions

View File

@ -81,7 +81,7 @@ void PairEAMOpt::eval()
int i,j,ii,jj,inum,jnum,itype,jtype;
double evdwl = 0.0;
double* __restrict__ coeff;
double* _noalias coeff;
// grow energy array if necessary
@ -93,13 +93,13 @@ void PairEAMOpt::eval()
fp = (double *) memory->smalloc(nmax*sizeof(double),"pair:fp");
}
double** __restrict__ x = atom->x;
double** __restrict__ f = atom->f;
int* __restrict__ type = atom->type;
double** _noalias x = atom->x;
double** _noalias f = atom->f;
int* _noalias type = atom->type;
int nlocal = atom->nlocal;
vec3_t* __restrict__ xx = (vec3_t*)x[0];
vec3_t* __restrict__ ff = (vec3_t*)f[0];
vec3_t* _noalias xx = (vec3_t*)x[0];
vec3_t* _noalias ff = (vec3_t*)f[0];
double tmp_cutforcesq = cutforcesq;
double tmp_rdr = rdr;
@ -107,17 +107,17 @@ void PairEAMOpt::eval()
int nr1 = nr-1;
inum = list->inum;
int* __restrict__ ilist = list->ilist;
int** __restrict__ firstneigh = list->firstneigh;
int* __restrict__ numneigh = list->numneigh;
int* _noalias ilist = list->ilist;
int** _noalias firstneigh = list->firstneigh;
int* _noalias numneigh = list->numneigh;
int ntypes = atom->ntypes;
int ntypes2 = ntypes*ntypes;
fast_alpha_t* __restrict__ fast_alpha =
fast_alpha_t* _noalias fast_alpha =
(fast_alpha_t*) malloc(ntypes2*(nr+1)*sizeof(fast_alpha_t));
for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
fast_alpha_t* __restrict__ tab = &fast_alpha[i*ntypes*nr+j*nr];
fast_alpha_t* _noalias tab = &fast_alpha[i*ntypes*nr+j*nr];
if (type2rhor[i+1][j+1] >= 0) {
for(int m = 1; m <= nr; m++) {
tab[m].rhor0i = rhor_spline[type2rhor[i+1][j+1]][m][6];
@ -135,12 +135,12 @@ void PairEAMOpt::eval()
}
}
}
fast_alpha_t* __restrict__ tabeight = fast_alpha;
fast_alpha_t* _noalias tabeight = fast_alpha;
fast_gamma_t* __restrict__ fast_gamma =
fast_gamma_t* _noalias fast_gamma =
(fast_gamma_t*) malloc(ntypes2*(nr+1)*sizeof(fast_gamma_t));
for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
fast_gamma_t* __restrict__ tab = &fast_gamma[i*ntypes*nr+j*nr];
fast_gamma_t* _noalias tab = &fast_gamma[i*ntypes*nr+j*nr];
if (type2rhor[i+1][j+1] >= 0) {
for(int m = 1; m <= nr; m++) {
tab[m].rhor4i = rhor_spline[type2rhor[i+1][j+1]][m][2];
@ -168,7 +168,7 @@ void PairEAMOpt::eval()
}
}
}
fast_gamma_t* __restrict__ tabss = fast_gamma;
fast_gamma_t* _noalias tabss = fast_gamma;
// zero out density
@ -188,11 +188,11 @@ void PairEAMOpt::eval()
double ytmp = xx[i].y;
double ztmp = xx[i].z;
itype = type[i] - 1;
int* __restrict__ jlist = firstneigh[i];
int* _noalias jlist = firstneigh[i];
jnum = numneigh[i];
double tmprho = rho[i];
fast_alpha_t* __restrict__ tabeighti = &tabeight[itype*ntypes*nr];
fast_alpha_t* _noalias tabeighti = &tabeight[itype*ntypes*nr];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
@ -265,14 +265,14 @@ void PairEAMOpt::eval()
double ytmp = xx[i].y;
double ztmp = xx[i].z;
int itype1 = type[i] - 1;
int* __restrict__ jlist = firstneigh[i];
int* _noalias jlist = firstneigh[i];
jnum = numneigh[i];
double tmpfx = 0.0;
double tmpfy = 0.0;
double tmpfz = 0.0;
fast_gamma_t* __restrict__ tabssi = &tabss[itype1*ntypes*nr];
fast_gamma_t* _noalias tabssi = &tabss[itype1*ntypes*nr];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];

View File

@ -84,22 +84,22 @@ void PairLJCharmmCoulLongOpt::eval()
double evdwl = 0.0;
double ecoul = 0.0;
double** __restrict__ x = atom->x;
double** __restrict__ f = atom->f;
double* __restrict__ q = atom->q;
int* __restrict__ type = atom->type;
double** _noalias x = atom->x;
double** _noalias f = atom->f;
double* _noalias q = atom->q;
int* _noalias type = atom->type;
int nlocal = atom->nlocal;
double* __restrict__ special_coul = force->special_coul;
double* __restrict__ special_lj = force->special_lj;
double* _noalias special_coul = force->special_coul;
double* _noalias special_lj = force->special_lj;
double qqrd2e = force->qqrd2e;
inum = list->inum;
int* __restrict__ ilist = list->ilist;
int** __restrict__ firstneigh = list->firstneigh;
int* __restrict__ numneigh = list->numneigh;
int* _noalias ilist = list->ilist;
int** _noalias firstneigh = list->firstneigh;
int* _noalias numneigh = list->numneigh;
vec3_t* __restrict__ xx = (vec3_t*)x[0];
vec3_t* __restrict__ ff = (vec3_t*)f[0];
vec3_t* _noalias xx = (vec3_t*)x[0];
vec3_t* _noalias ff = (vec3_t*)f[0];
int ntypes = atom->ntypes;
int ntypes2 = ntypes*ntypes;
@ -107,7 +107,7 @@ void PairLJCharmmCoulLongOpt::eval()
double tmp_coef1 = 1.0/denom_lj;
double tmp_coef2 = cut_ljsq - 3.0*cut_lj_innersq;
fast_alpha_t* __restrict__ fast_alpha =
fast_alpha_t* _noalias fast_alpha =
(fast_alpha_t*)malloc(ntypes2*sizeof(fast_alpha_t));
for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
fast_alpha_t& a = fast_alpha[i*ntypes+j];
@ -117,7 +117,7 @@ void PairLJCharmmCoulLongOpt::eval()
a.lj3 = lj3[i+1][j+1];
a.lj4 = lj4[i+1][j+1];
}
fast_alpha_t* __restrict__ tabsix = fast_alpha;
fast_alpha_t* _noalias tabsix = fast_alpha;
// loop over neighbors of my atoms
@ -128,14 +128,14 @@ void PairLJCharmmCoulLongOpt::eval()
double ytmp = xx[i].y;
double ztmp = xx[i].z;
itype = type[i] - 1;
int* __restrict__ jlist = firstneigh[i];
int* _noalias jlist = firstneigh[i];
jnum = numneigh[i];
double tmpfx = 0.0;
double tmpfy = 0.0;
double tmpfz = 0.0;
fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
fast_alpha_t* _noalias tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];

View File

@ -67,24 +67,24 @@ void PairLJCutOpt::eval()
double factor_lj;
double evdwl = 0.0;
double** __restrict__ x = atom->x;
double** __restrict__ f = atom->f;
int* __restrict__ type = atom->type;
double** _noalias x = atom->x;
double** _noalias f = atom->f;
int* _noalias type = atom->type;
int nlocal = atom->nlocal;
double* __restrict__ special_lj = force->special_lj;
double* _noalias special_lj = force->special_lj;
inum = list->inum;
int* __restrict__ ilist = list->ilist;
int** __restrict__ firstneigh = list->firstneigh;
int* __restrict__ numneigh = list->numneigh;
int* _noalias ilist = list->ilist;
int** _noalias firstneigh = list->firstneigh;
int* _noalias numneigh = list->numneigh;
vec3_t* __restrict__ xx = (vec3_t*)x[0];
vec3_t* __restrict__ ff = (vec3_t*)f[0];
vec3_t* _noalias xx = (vec3_t*)x[0];
vec3_t* _noalias ff = (vec3_t*)f[0];
int ntypes = atom->ntypes;
int ntypes2 = ntypes*ntypes;
fast_alpha_t* __restrict__ fast_alpha =
fast_alpha_t* _noalias fast_alpha =
(fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
fast_alpha_t& a = fast_alpha[i*ntypes+j];
@ -95,7 +95,7 @@ void PairLJCutOpt::eval()
a.lj4 = lj4[i+1][j+1];
a.offset = offset[i+1][j+1];
}
fast_alpha_t* __restrict__ tabsix = fast_alpha;
fast_alpha_t* _noalias tabsix = fast_alpha;
// loop over neighbors of my atoms
@ -105,14 +105,14 @@ void PairLJCutOpt::eval()
double ytmp = xx[i].y;
double ztmp = xx[i].z;
itype = type[i] - 1;
int* __restrict__ jlist = firstneigh[i];
int* _noalias jlist = firstneigh[i];
jnum = numneigh[i];
double tmpfx = 0.0;
double tmpfy = 0.0;
double tmpfz = 0.0;
fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];

View File

@ -68,24 +68,24 @@ void PairMorseOpt::eval()
double factor_lj;
double evdwl = 0.0;
double** __restrict__ x = atom->x;
double** __restrict__ f = atom->f;
int* __restrict__ type = atom->type;
double** _noalias x = atom->x;
double** _noalias f = atom->f;
int* _noalias type = atom->type;
int nlocal = atom->nlocal;
double* __restrict__ special_lj = force->special_lj;
double* _noalias special_lj = force->special_lj;
inum = list->inum;
int* __restrict__ ilist = list->ilist;
int** __restrict__ firstneigh = list->firstneigh;
int* __restrict__ numneigh = list->numneigh;
int* _noalias ilist = list->ilist;
int** _noalias firstneigh = list->firstneigh;
int* _noalias numneigh = list->numneigh;
vec3_t* __restrict__ xx = (vec3_t*)x[0];
vec3_t* __restrict__ ff = (vec3_t*)f[0];
vec3_t* _noalias xx = (vec3_t*)x[0];
vec3_t* _noalias ff = (vec3_t*)f[0];
int ntypes = atom->ntypes;
int ntypes2 = ntypes*ntypes;
fast_alpha_t* __restrict__ fast_alpha =
fast_alpha_t* _noalias fast_alpha =
(fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
fast_alpha_t& a = fast_alpha[i*ntypes+j];
@ -96,7 +96,7 @@ void PairMorseOpt::eval()
a.d0 = d0[i+1][j+1];
a.offset = offset[i+1][j+1];
}
fast_alpha_t* __restrict__ tabsix = fast_alpha;
fast_alpha_t* _noalias tabsix = fast_alpha;
// loop over neighbors of my atoms
@ -106,14 +106,14 @@ void PairMorseOpt::eval()
double ytmp = xx[i].y;
double ztmp = xx[i].z;
itype = type[i] - 1;
int* __restrict__ jlist = firstneigh[i];
int* _noalias jlist = firstneigh[i];
jnum = numneigh[i];
double tmpfx = 0.0;
double tmpfy = 0.0;
double tmpfz = 0.0;
fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];

View File

@ -38,8 +38,6 @@ Intel compilers.
-----------------------------------------------------------------------------
The files in this package must be compiled with the Intel C++
compiler, i.e. icc/icpc.
For portability reasons, vectorization directives are currently only enabled
for Intel compilers. Using other compilers may result in significantly
lower performance.

View File

@ -128,7 +128,7 @@ class FixIntel : public Fix {
protected:
int _overflow_flag[5];
__declspec(align(64)) int _off_overflow_flag[5];
_alignvar(int _off_overflow_flag[5],64);
int _allow_separate_buffers, _offload_ghost;
#ifdef _LMP_INTEL_OFFLOAD
double _balance_pair_time, _balance_other_time;
@ -155,18 +155,18 @@ class FixIntel : public Fix {
double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
double _timers[NUM_ITIMERS];
double _stopwatch[NUM_ITIMERS];
__declspec(align(64)) double _stopwatch_offload_neighbor[1];
__declspec(align(64)) double _stopwatch_offload_pair[1];
_alignvar(double _stopwatch_offload_neighbor[1],64);
_alignvar(double _stopwatch_offload_pair[1],64);
template <class ft, class acc_t>
inline void add_results(const ft * restrict const f_in,
const acc_t * restrict const ev_global,
inline void add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int offload);
template <class ft, class acc_t>
inline void add_oresults(const ft * restrict const f_in,
const acc_t * restrict const ev_global,
inline void add_oresults(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int out_offset, const int nall);
@ -176,8 +176,8 @@ class FixIntel : public Fix {
int _im_real_space_task;
MPI_Comm _real_space_comm;
template <class ft, class acc_t>
inline void add_off_results(const ft * restrict const f_in,
const acc_t * restrict const ev_global);
inline void add_off_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global);
#endif
};
@ -284,8 +284,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
/* ---------------------------------------------------------------------- */
template <class ft, class acc_t>
void FixIntel::add_results(const ft * restrict const f_in,
const acc_t * restrict const ev_global,
void FixIntel::add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int offload) {
start_watch(TIME_PACK);
@ -295,7 +295,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
if (offload) {
add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
if (force->newton_pair) {
const acc_t * restrict const enull = 0;
const acc_t * _noalias const enull = 0;
int offset = _offload_nlocal;
if (atom->torque) offset *= 2;
add_oresults(f_in + offset, enull, eatom, vatom,
@ -305,7 +305,7 @@ void FixIntel::add_results(const ft * restrict const f_in,
add_oresults(f_in, ev_global, eatom, vatom,
_host_min_local, _host_used_local);
if (force->newton_pair) {
const acc_t * restrict const enull = 0;
const acc_t * _noalias const enull = 0;
int offset = _host_used_local;
if (atom->torque) offset *= 2;
add_oresults(f_in + offset, enull, eatom,
@ -333,11 +333,11 @@ void FixIntel::add_results(const ft * restrict const f_in,
/* ---------------------------------------------------------------------- */
template <class ft, class acc_t>
void FixIntel::add_oresults(const ft * restrict const f_in,
const acc_t * restrict const ev_global,
void FixIntel::add_oresults(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int out_offset, const int nall) {
lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
if (atom->torque) {
if (f_in[1].w)
if (f_in[1].w == 1)
@ -351,12 +351,16 @@ void FixIntel::add_oresults(const ft * restrict const f_in,
#pragma omp parallel default(none)
#endif
{
#if defined(_OPENMP)
const int tid = omp_get_thread_num();
#else
const int tid = 0;
#endif
int ifrom, ito;
IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
if (atom->torque) {
int ii = ifrom * 2;
lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] +
lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
out_offset;
if (eatom) {
for (int i = ifrom; i < ito; i++) {
@ -440,6 +444,7 @@ void FixIntel::balance_stamp() {
/* ---------------------------------------------------------------------- */
void FixIntel::acc_timers() {
_timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
if (neighbor->ago == 0) {
_timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
if (_setup_time_cleared == false) {
@ -447,7 +452,6 @@ void FixIntel::acc_timers() {
_setup_time_cleared = true;
}
}
_timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
}
/* ---------------------------------------------------------------------- */
@ -464,8 +468,8 @@ void FixIntel::set_neighbor_host_sizes() {
/* ---------------------------------------------------------------------- */
template <class ft, class acc_t>
void FixIntel::add_off_results(const ft * restrict const f_in,
const acc_t * restrict const ev_global) {
void FixIntel::add_off_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global) {
if (_offload_balance < 0.0)
_balance_other_time = MPI_Wtime() - _balance_other_time;

View File

@ -22,8 +22,8 @@ using namespace LAMMPS_NS;
template <class flt_t, class acc_t>
IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _buf_size(0),
_buf_local_size(0), _off_threads(0) {
lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
_buf_size(0), _buf_local_size(0) {
_list_alloc_atoms = 0;
_ntypes = 0;
_off_map_maxlocal = 0;
@ -423,6 +423,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
tmem += _off_map_maxlocal * sizeof(int);
tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
tmem += _ntypes * _ntypes * sizeof(int);
return tmem;
}
/* ---------------------------------------------------------------------- */

View File

@ -235,7 +235,8 @@ class IntelBuffers {
double memory_usage(const int nthreads);
int _special_holder, _nspecial_holder;
tagint _special_holder;
int _nspecial_holder;
protected:
LAMMPS *lmp;
@ -266,8 +267,8 @@ class IntelBuffers {
#endif
int _buf_size, _buf_local_size;
__declspec(align(64)) acc_t _ev_global[8];
__declspec(align(64)) acc_t _ev_global_host[8];
_alignvar(acc_t _ev_global[8],64);
_alignvar(acc_t _ev_global_host[8],64);
void _grow(const int nall, const int nlocal, const int nthreads,
const int offload_end);

View File

@ -26,6 +26,22 @@
#ifndef LAMMPS_MEMALIGN
#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
#else
#if (LAMMPS_MEMALIGN != 64)
#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
#endif
#endif
#if defined(_OPENMP)
#define _use_omp_pragma(txt) _Pragma(txt)
#else
#define _use_omp_pragma(txt)
#endif
#if defined(__INTEL_COMPILER)
#define _use_simd_pragma(txt) _Pragma(txt)
#else
#define _use_simd_pragma(txt)
#endif
namespace LAMMPS_NS {
@ -141,7 +157,7 @@ inline double MIC_Wtime() {
if (fix->separate_buffers() && ago != 0) { \
fix->start_watch(TIME_PACK); \
if (offload) { \
_Pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \
_use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \
{ \
int ifrom, ito, tid; \
int nthreads = comm->nthreads; \
@ -343,15 +359,16 @@ inline double MIC_Wtime() {
else \
o_range = nlocal; \
if (offload == 0) o_range -= minlocal; \
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \
sizeof(acc_t)); \
\
int t_off = f_stride; \
if (eflag && eatom) { \
for (int t = 1; t < nthreads; t++) { \
_Pragma("vector nontemporal") \
_use_simd_pragma("vector nontemporal") \
_use_simd_pragma("novector") \
for (int n = iifrom; n < iito; n++) { \
f_start[n].x += f_start[n + t_off].x; \
f_start[n].x += f_start[n + t_off].x; \
f_start[n].y += f_start[n + t_off].y; \
f_start[n].z += f_start[n + t_off].z; \
f_start[n].w += f_start[n + t_off].w; \
@ -360,8 +377,9 @@ inline double MIC_Wtime() {
} \
} else { \
for (int t = 1; t < nthreads; t++) { \
_Pragma("vector nontemporal") \
for (int n = iifrom; n < iito; n++) { \
_use_simd_pragma("vector nontemporal") \
_use_simd_pragma("novector") \
for (int n = iifrom; n < iito; n++) { \
f_start[n].x += f_start[n + t_off].x; \
f_start[n].y += f_start[n + t_off].y; \
f_start[n].z += f_start[n + t_off].z; \
@ -372,8 +390,9 @@ inline double MIC_Wtime() {
\
if (evflag) { \
if (vflag == 2) { \
const ATOM_T * restrict const xo = x + minlocal; \
_Pragma("vector nontemporal") \
const ATOM_T * _noalias const xo = x + minlocal; \
_use_simd_pragma("vector nontemporal") \
_use_simd_pragma("novector") \
for (int n = iifrom; n < iito; n++) { \
ov0 += f_start[n].x * xo[n].x; \
ov1 += f_start[n].y * xo[n].y; \

View File

@ -287,7 +287,7 @@
if (fabs(aug_8) > fabs(aug_0)) { \
flt_t swapt; \
swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \
swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \
swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \
} \

View File

@ -79,7 +79,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
const int n1 = nspecial[i * 3]; \
const int n2 = nspecial[i * 3 + 1]; \
const int n3 = nspecial[i * 3 + 2]; \
const int *sptr = special + i * maxspecial; \
const tagint *sptr = special + i * maxspecial; \
for (int s = 0; s < n3; s++) { \
if (sptr[s] == tag) { \
if (s < n1) { \
@ -105,7 +105,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
template <class flt_t, class acc_t>
void Neighbor::bin_atoms(void * xin) {
const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin;
const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin;
int nlocal = atom->nlocal;
const int nall = nlocal + atom->nghost;
@ -243,11 +243,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
return;
}
const ATOM_T * restrict const x = buffers->get_x();
int * restrict const firstneigh = buffers->firstneigh(list);
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
const int molecular = atom->molecular;
int *ns = NULL, *s = NULL;
int *ns = NULL;
tagint *s = NULL;
int tag_size, special_size;
if (molecular) {
s = atom->special[0];
@ -260,23 +261,23 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
tag_size = 0;
special_size = 0;
}
const int * restrict const special = s;
const int * restrict const nspecial = ns;
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const int * restrict const tag = atom->tag;
const tagint * _noalias const tag = atom->tag;
int * restrict const ilist = list->ilist;
int * restrict numneigh = list->numneigh;
int * restrict const cnumneigh = buffers->cnumneigh(list);
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * restrict const stencil = list->stencil;
const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
int * const molecule = atom->molecule;
tagint * const molecule = atom->molecule;
#endif
int tnum;
@ -316,8 +317,8 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
}
#ifdef _LMP_INTEL_OFFLOAD
const int * restrict const binhead = this->binhead;
const int * restrict const special_flag = this->special_flag;
const int * _noalias const binhead = this->binhead;
const int * _noalias const special_flag = this->special_flag;
const int nbinx = this->nbinx;
const int nbiny = this->nbiny;
const int nbinz = this->nbinz;
@ -327,7 +328,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
const int mbinx = this->mbinx;
const int mbiny = this->mbiny;
const int mbinz = this->mbinz;
const int * restrict const bins = this->bins;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
@ -486,7 +487,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
@ -507,7 +508,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)
@ -662,14 +663,15 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
return;
}
const ATOM_T * restrict const x = buffers->get_x();
int * restrict const firstneigh = buffers->firstneigh(list);
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int nall_t = nall;
if (offload_noghost && offload) nall_t = atom->nlocal;
const int e_nall = nall_t;
const int molecular = atom->molecular;
int *ns = NULL, *s = NULL;
int *ns = NULL;
tagint *s = NULL;
int tag_size, special_size;
if (molecular) {
s = atom->special[0];
@ -682,23 +684,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
tag_size = 0;
special_size = 0;
}
const int * restrict const special = s;
const int * restrict const nspecial = ns;
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const int * restrict const tag = atom->tag;
const tagint * _noalias const tag = atom->tag;
int * restrict const ilist = list->ilist;
int * restrict numneigh = list->numneigh;
int * restrict const cnumneigh = buffers->cnumneigh(list);
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * restrict const stencil = list->stencil;
const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
int * const molecule = atom->molecule;
tagint * const molecule = atom->molecule;
#endif
int tnum;
@ -737,8 +739,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
}
#ifdef _LMP_INTEL_OFFLOAD
const int * restrict const binhead = this->binhead;
const int * restrict const special_flag = this->special_flag;
const int * _noalias const binhead = this->binhead;
const int * _noalias const special_flag = this->special_flag;
const int nbinx = this->nbinx;
const int nbiny = this->nbiny;
const int nbinz = this->nbinz;
@ -748,7 +750,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
const int mbinx = this->mbinx;
const int mbiny = this->mbiny;
const int mbinz = this->mbinz;
const int * restrict const bins = this->bins;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
@ -948,7 +950,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
@ -970,7 +972,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)
@ -1127,14 +1129,15 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
return;
}
const ATOM_T * restrict const x = buffers->get_x();
int * restrict const firstneigh = buffers->firstneigh(list);
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int nall_t = nall;
if (offload_noghost && offload) nall_t = atom->nlocal;
const int e_nall = nall_t;
const int molecular = atom->molecular;
int *ns = NULL, *s = NULL;
int *ns = NULL;
tagint *s = NULL;
int tag_size, special_size;
if (molecular) {
s = atom->special[0];
@ -1147,23 +1150,23 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
tag_size = 0;
special_size = 0;
}
const int * restrict const special = s;
const int * restrict const nspecial = ns;
const tagint * _noalias const special = s;
const int * _noalias const nspecial = ns;
const int maxspecial = atom->maxspecial;
const int * restrict const tag = atom->tag;
const tagint * _noalias const tag = atom->tag;
int * restrict const ilist = list->ilist;
int * restrict numneigh = list->numneigh;
int * restrict const cnumneigh = buffers->cnumneigh(list);
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = list->nstencil;
const int * restrict const stencil = list->stencil;
const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
const int * _noalias const stencil = list->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
const int ntypes = atom->ntypes + 1;
const int nlocal = atom->nlocal;
#ifndef _LMP_INTEL_OFFLOAD
int * const mask = atom->mask;
int * const molecule = atom->molecule;
tagint * const molecule = atom->molecule;
#endif
int tnum;
@ -1202,8 +1205,8 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
}
#ifdef _LMP_INTEL_OFFLOAD
const int * restrict const binhead = this->binhead;
const int * restrict const special_flag = this->special_flag;
const int * _noalias const binhead = this->binhead;
const int * _noalias const special_flag = this->special_flag;
const int nbinx = this->nbinx;
const int nbiny = this->nbiny;
const int nbinz = this->nbinz;
@ -1213,7 +1216,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
const int mbinx = this->mbinx;
const int mbiny = this->mbiny;
const int mbinz = this->mbinz;
const int * restrict const bins = this->bins;
const int * _noalias const bins = this->bins;
const int cop = fix->coprocessor_number();
const int separate_buffers = fix->separate_buffers();
#pragma offload target(mic:cop) if(offload) \
@ -1386,7 +1389,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
if (molecular) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
const int j = jlist[jj];
@ -1407,7 +1410,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
#ifdef _LMP_INTEL_OFFLOAD
else if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * restrict jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
int jj = 0;
for (jj = 0; jj < jnum; jj++)

View File

@ -79,7 +79,7 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
fix->start_watch(TIME_PACK);
const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
const int * const ellipsoid = atom->ellipsoid;
QUAT_T * restrict const quat = buffers->get_quat();
QUAT_T * _noalias const quat = buffers->get_quat();
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
#endif
@ -150,8 +150,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
fix->get_buffern(offload, nlocal, nall, minlocal);
const int ago = neighbor->ago;
ATOM_T * restrict const x = buffers->get_x(offload);
QUAT_T * restrict const quat = buffers->get_quat(offload);
ATOM_T * _noalias const x = buffers->get_x(offload);
QUAT_T * _noalias const quat = buffers->get_quat(offload);
const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
const int *ellipsoid = atom->ellipsoid;
@ -225,15 +225,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
}
#endif
// const int * restrict const ilist = list->ilist;
const int * restrict const numneigh = list->numneigh;
const int * restrict const cnumneigh = buffers->cnumneigh(list);
const int * restrict const firstneigh = buffers->firstneigh(list);
const flt_t * restrict const special_lj = fc.special_lj;
// const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * restrict const ijc = fc.ijc[0];
const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
const FC_PACKED3_T * restrict const ic = fc.ic;
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
const FC_PACKED3_T * _noalias const ic = fc.ic;
const flt_t mu = fc.mu;
const flt_t gamma = fc.gamma;
const flt_t upsilon = fc.upsilon;
@ -255,8 +255,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * restrict f_start;
acc_t * restrict ev_global;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int max_nbors = _max_nbors;
const int nthreads = tc;
@ -351,25 +351,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
iifrom += astart;
iito += astart;
FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride);
FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors;
flt_t * restrict const delx_form = delx_formi + tid * max_nbors;
flt_t * restrict const dely_form = dely_formi + tid * max_nbors;
flt_t * restrict const delz_form = delz_formi + tid * max_nbors;
int * restrict const jtype_form = jtype_formi + tid * max_nbors;
int * restrict const jlist_form = jlist_formi + tid * max_nbors;
flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
int ierror = 0;
for (int i = iifrom; i < iito; ++i) {
// const int i = ilist[ii];
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const FC_PACKED1_T * restrict const ijci = ijc + ptr_off;
const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * restrict const jlist = firstneigh + cnumneigh[i];
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
const flt_t xtmp = x[i].x;
@ -433,9 +433,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
__assume(packed_j % 8 == 0);
__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
#endif
#if defined(__INTEL_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
#endif
for (int jj = 0; jj < packed_j; jj++) {
flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
@ -796,7 +798,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
int t_off = f_stride;
if (EFLAG && eatom) {
for (int t = 1; t < nthreads; t++) {
#if defined(__INTEL_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
@ -807,7 +812,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
}
} else {
for (int t = 1; t < nthreads; t++) {
#if defined(__INTEL_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom * 2; n < two_iito; n++) {
f_start[n].x += f_start[n + t_off].x;
f_start[n].y += f_start[n + t_off].y;
@ -819,8 +827,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
if (EVFLAG) {
if (vflag==2) {
const ATOM_T * restrict const xo = x + minlocal;
const ATOM_T * _noalias const xo = x + minlocal;
#if defined(__INTEL_COMPILER)
#pragma vector nontemporal
#pragma novector
#endif
for (int n = iifrom; n < iito; n++) {
const int nt2 = n * 2;
ov0 += f_start[nt2].x * xo[n].x;

View File

@ -62,7 +62,10 @@ class PairGayBerneIntel : public PairGayBerne {
typedef struct { flt_t lj3, lj4; } fc_packed2;
typedef struct { flt_t shape2[4], well[4]; } fc_packed3;
__declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
_alignvar(flt_t special_lj[4],64);
_alignvar(flt_t gamma,64);
_alignvar(flt_t upsilon,64);
_alignvar(flt_t mu,64);
fc_packed1 **ijc;
fc_packed2 **lj34;
fc_packed3 *ic;

View File

@ -143,25 +143,25 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * restrict const x = buffers->get_x(offload);
flt_t * restrict const q = buffers->get_q(offload);
ATOM_T * _noalias const x = buffers->get_x(offload);
flt_t * _noalias const q = buffers->get_q(offload);
const int * restrict const numneigh = list->numneigh;
const int * restrict const cnumneigh = buffers->cnumneigh(list);
const int * restrict const firstneigh = buffers->firstneigh(list);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * restrict const special_coul = fc.special_coul;
const flt_t * restrict const special_lj = fc.special_lj;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
const flt_t qqrd2e = force->qqrd2e;
const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
const flt_t * restrict const cutsq = fc.cutsq[0];
const LJ_T * restrict const lj = fc.lj[0];
const TABLE_T * restrict const table = fc.table;
const flt_t * restrict const etable = fc.etable;
const flt_t * restrict const detable = fc.detable;
const flt_t * restrict const ctable = fc.ctable;
const flt_t * restrict const dctable = fc.dctable;
const flt_t * _noalias const cutsq = fc.cutsq[0];
const LJ_T * _noalias const lj = fc.lj[0];
const TABLE_T * _noalias const table = fc.table;
const flt_t * _noalias const etable = fc.etable;
const flt_t * _noalias const detable = fc.detable;
const flt_t * _noalias const ctable = fc.ctable;
const flt_t * _noalias const dctable = fc.dctable;
const flt_t cut_ljsq = fc.cut_ljsq;
const flt_t cut_lj_innersq = fc.cut_lj_innersq;
const flt_t cut_coulsq = fc.cut_coulsq;
@ -178,8 +178,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * restrict f_start;
acc_t * restrict ev_global;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
@ -242,7 +242,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
iifrom += astart;
iito += astart;
FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
flt_t cutboth = cut_coulsq;
@ -251,10 +251,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const flt_t * restrict const cutsqi = cutsq + ptr_off;
const LJ_T * restrict const lji = lj + ptr_off;
const flt_t * _noalias const cutsqi = cutsq + ptr_off;
const LJ_T * _noalias const lji = lj + ptr_off;
const int * restrict const jlist = firstneigh + cnumneigh[i];
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -270,9 +270,11 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
}
#if defined(__INTEL_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;

View File

@ -62,8 +62,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
class ForceConst {
public:
typedef struct { flt_t r, dr, f, df; } table_t;
__declspec(align(64)) flt_t special_coul[4];
__declspec(align(64)) flt_t special_lj[4];
_alignvar(flt_t special_coul[4],64);
_alignvar(flt_t special_lj[4],64);
flt_t **cutsq, g_ewald, tabinnersq;
flt_t cut_coulsq, cut_ljsq;
flt_t cut_lj_innersq;

View File

@ -143,24 +143,24 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * restrict const x = buffers->get_x(offload);
flt_t * restrict const q = buffers->get_q(offload);
ATOM_T * _noalias const x = buffers->get_x(offload);
flt_t * _noalias const q = buffers->get_q(offload);
const int * restrict const numneigh = list->numneigh;
const int * restrict const cnumneigh = buffers->cnumneigh(list);
const int * restrict const firstneigh = buffers->firstneigh(list);
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * restrict const special_coul = fc.special_coul;
const flt_t * restrict const special_lj = fc.special_lj;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
const flt_t qqrd2e = force->qqrd2e;
const C_FORCE_T * restrict const c_force = fc.c_force[0];
const C_ENERGY_T * restrict const c_energy = fc.c_energy[0];
const TABLE_T * restrict const table = fc.table;
const flt_t * restrict const etable = fc.etable;
const flt_t * restrict const detable = fc.detable;
const flt_t * restrict const ctable = fc.ctable;
const flt_t * restrict const dctable = fc.dctable;
const C_FORCE_T * _noalias const c_force = fc.c_force[0];
const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
const TABLE_T * _noalias const table = fc.table;
const flt_t * _noalias const etable = fc.etable;
const flt_t * _noalias const detable = fc.detable;
const flt_t * _noalias const ctable = fc.ctable;
const flt_t * _noalias const dctable = fc.dctable;
const flt_t g_ewald = fc.g_ewald;
const flt_t tabinnersq = fc.tabinnersq;
@ -174,8 +174,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * restrict f_start;
acc_t * restrict ev_global;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
@ -237,17 +237,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
iifrom += astart;
iito += astart;
FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
for (int i = iifrom; i < iito; ++i) {
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const C_FORCE_T * restrict const c_forcei = c_force + ptr_off;
const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off;
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const int * restrict const jlist = firstneigh + cnumneigh[i];
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -263,9 +263,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
}
#if defined(__INTEL_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcecoul, forcelj, evdwl, ecoul;
forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;

View File

@ -64,8 +64,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t;
typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t;
typedef struct { flt_t r, dr, f, df; } table_t;
__declspec(align(64)) flt_t special_coul[4];
__declspec(align(64)) flt_t special_lj[4];
_alignvar(flt_t special_coul[4],64);
_alignvar(flt_t special_lj[4],64);
flt_t g_ewald, tabinnersq;
c_force_t **c_force;
c_energy_t **c_energy;

View File

@ -134,14 +134,14 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
const int ago = neighbor->ago;
IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
ATOM_T * restrict const x = buffers->get_x(offload);
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * restrict const numneigh = list->numneigh;
const int * restrict const cnumneigh = buffers->cnumneigh(list);
const int * restrict const firstneigh = buffers->firstneigh(list);
const flt_t * restrict const special_lj = fc.special_lj;
const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0];
const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
@ -153,8 +153,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
x_size, q_size, ev_size, f_stride);
int tc;
FORCE_T * restrict f_start;
acc_t * restrict ev_global;
FORCE_T * _noalias f_start;
acc_t * _noalias ev_global;
IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
const int nthreads = tc;
int *overflow = fix->get_off_overflow_flag();
@ -184,17 +184,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
iifrom += astart;
iito += astart;
FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
for (int i = iifrom; i < iito; ++i) {
const int itype = x[i].w;
const int ptr_off = itype * ntypes;
const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off;
const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * restrict const jlist = firstneigh + cnumneigh[i];
const int * _noalias const jlist = firstneigh + cnumneigh[i];
const int jnum = numneigh[i];
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -209,9 +209,11 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
}
#if defined(__INTEL_COMPILER)
#pragma vector aligned
#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
sv0, sv1, sv2, sv3, sv4, sv5)
#endif
for (int jj = 0; jj < jnum; jj++) {
flt_t forcelj, evdwl;
forcelj = evdwl = (flt_t)0.0;

View File

@ -62,7 +62,7 @@ class PairLJCutIntel : public PairLJCut {
typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1;
typedef struct { flt_t lj3, lj4; } fc_packed2;
__declspec(align(64)) flt_t special_lj[4];
_alignvar(flt_t special_lj[4],64);
fc_packed1 **ljc12o;
fc_packed2 **lj34;

View File

@ -17,6 +17,7 @@
#include "domain.h"
#include "comm.h"
#include "atom.h"
#include "atom_vec.h"
#include "force.h"
#include "pair.h"
#include "bond.h"
@ -81,14 +82,9 @@ void VerletIntel::init()
// set flags for what arrays to clear in force_clear()
// need to clear additionals arrays if they exist
torqueflag = 0;
torqueflag = extraflag = 0;
if (atom->torque_flag) torqueflag = 1;
erforceflag = 0;
if (atom->erforce_flag) erforceflag = 1;
e_flag = 0;
if (atom->e_flag) e_flag = 1;
rho_flag = 0;
if (atom->rho_flag) rho_flag = 1;
if (atom->avec->forceclearflag) extraflag = 1;
// orthogonal vs triclinic simulation box
@ -388,7 +384,7 @@ void VerletIntel::cleanup()
void VerletIntel::force_clear()
{
int i;
size_t nbytes;
if (external_force_clear) return;
@ -396,19 +392,16 @@ void VerletIntel::force_clear()
// if either newton flag is set, also include ghosts
// when using threads always clear all forces.
if (neighbor->includegroup == 0) {
int nall;
if (force->newton) nall = atom->nlocal + atom->nghost;
else nall = atom->nlocal;
int nlocal = atom->nlocal;
size_t nbytes = sizeof(double) * nall;
if (neighbor->includegroup == 0) {
nbytes = sizeof(double) * nlocal;
if (force->newton) nbytes += sizeof(double) * atom->nghost;
if (nbytes) {
memset(&(atom->f[0][0]),0,3*nbytes);
if (torqueflag) memset(&(atom->torque[0][0]),0,3*nbytes);
if (erforceflag) memset(&(atom->erforce[0]), 0, nbytes);
if (e_flag) memset(&(atom->de[0]), 0, nbytes);
if (rho_flag) memset(&(atom->drho[0]), 0, nbytes);
memset(&atom->f[0][0],0,3*nbytes);
if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
if (extraflag) atom->avec->force_clear(0,nbytes);
}
// neighbor includegroup flag is set
@ -416,70 +409,21 @@ void VerletIntel::force_clear()
// if either newton flag is set, also include ghosts
} else {
int nall = atom->nfirst;
nbytes = sizeof(double) * atom->nfirst;
double **f = atom->f;
for (i = 0; i < nall; i++) {
f[i][0] = 0.0;
f[i][1] = 0.0;
f[i][2] = 0.0;
}
if (torqueflag) {
double **torque = atom->torque;
for (i = 0; i < nall; i++) {
torque[i][0] = 0.0;
torque[i][1] = 0.0;
torque[i][2] = 0.0;
}
}
if (erforceflag) {
double *erforce = atom->erforce;
for (i = 0; i < nall; i++) erforce[i] = 0.0;
}
if (e_flag) {
double *de = atom->de;
for (i = 0; i < nall; i++) de[i] = 0.0;
}
if (rho_flag) {
double *drho = atom->drho;
for (i = 0; i < nall; i++) drho[i] = 0.0;
if (nbytes) {
memset(&atom->f[0][0],0,3*nbytes);
if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes);
if (extraflag) atom->avec->force_clear(0,nbytes);
}
if (force->newton) {
nall = atom->nlocal + atom->nghost;
nbytes = sizeof(double) * atom->nghost;
for (i = atom->nlocal; i < nall; i++) {
f[i][0] = 0.0;
f[i][1] = 0.0;
f[i][2] = 0.0;
}
if (torqueflag) {
double **torque = atom->torque;
for (i = atom->nlocal; i < nall; i++) {
torque[i][0] = 0.0;
torque[i][1] = 0.0;
torque[i][2] = 0.0;
}
}
if (erforceflag) {
double *erforce = atom->erforce;
for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0;
}
if (e_flag) {
double *de = atom->de;
for (i = 0; i < nall; i++) de[i] = 0.0;
}
if (rho_flag) {
double *drho = atom->drho;
for (i = 0; i < nall; i++) drho[i] = 0.0;
if (nbytes) {
memset(&atom->f[nlocal][0],0,3*nbytes);
if (torqueflag) memset(&atom->torque[nlocal][0],0,3*nbytes);
if (extraflag) atom->avec->force_clear(nlocal,nbytes);
}
}
}

View File

@ -39,8 +39,7 @@ class VerletIntel : public Integrate {
protected:
int triclinic; // 0 if domain is orthog, 1 if triclinic
int torqueflag,erforceflag;
int e_flag,rho_flag;
int torqueflag,extraflag;
virtual void force_clear();
#ifdef _LMP_INTEL_OFFLOAD

View File

@ -52,6 +52,9 @@ VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) :
if (universe->procs_per_world[0] % universe->procs_per_world[1])
error->universe_all(FLERR,"Verlet/split requires Rspace partition "
"size be multiple of Kspace partition size");
if (comm->style != 0)
error->universe_all(FLERR,"Verlet/split can only currently be used with "
"comm_style brick");
// master = 1 for Rspace procs, 0 for Kspace procs
@ -214,6 +217,9 @@ VerletSplitIntel::~VerletSplitIntel()
void VerletSplitIntel::init()
{
if (comm->style != 0)
error->universe_all(FLERR,"Verlet/split can only currently be used with "
"comm_style brick");
if (!force->kspace && comm->me == 0)
error->warning(FLERR,"No Kspace calculation with verlet/split");

View File

@ -48,11 +48,6 @@ static double mypow(double x, int n) {
}
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -22,11 +22,6 @@
using namespace LAMMPS_NS;
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ----------------------------------------------------------------------
enforce PBC and modify box image flags for each atom

View File

@ -33,11 +33,6 @@ using namespace FixConst;
enum{NOBIAS,BIAS};
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -34,11 +34,6 @@ enum{ISO,ANISO,TRICLINIC};
#define TILTMAX 1.5
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ----------------------------------------------------------------------
change box size

View File

@ -31,11 +31,6 @@ enum{NOBIAS,BIAS};
#define INERTIA 0.4 // moment of inertia prefactor for sphere
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -19,11 +19,6 @@ using namespace LAMMPS_NS;
using namespace FixConst;
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -34,11 +34,6 @@ using namespace FixConst;
enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -48,11 +48,6 @@ enum{ISO,ANISO,TRICLINIC}; // same as in FixRigid
#define EINERTIA 0.4 // moment of inertia prefactor for ellipsoid
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ----------------------------------------------------------------------
perform preforce velocity Verlet integration

View File

@ -42,11 +42,6 @@ enum{SINGLE,MOLECULE,GROUP}; // same as in FixRigid
#define EINERTIA 0.4 // moment of inertia prefactor for ellipsoid
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -42,11 +42,6 @@ using namespace MathConst;
enum{FULL_BODY,INITIAL,FINAL,FORCE_TORQUE,VCM_ANGMOM,XCM_MASS,ITENSOR,DOF};
typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
/* ---------------------------------------------------------------------- */

View File

@ -195,14 +195,4 @@ typedef struct { int a,b,c,d,t; } int5_t;
}
#ifdef _noalias
#undef _noalias
#endif
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
#endif

View File

@ -167,6 +167,36 @@ typedef int bigint;
}
// preprocessor macros for compiler specific settings
// clear previous definitions to avoid redefinition warning
#ifdef _alignvar
#undef _alignvar
#endif
#ifdef _noalias
#undef _noalias
#endif
// define stack variable alignment
#if defined(__INTEL_COMPILER)
#define _alignvar(expr,val) __declspec(align(val)) expr
#elif defined(__GNUC__)
#define _alignvar(expr,val) expr __attribute((aligned(val)))
#else
#define _alignvar(expr,val) expr
#endif
// declaration to lift aliasing restrictions
#if defined(__INTEL_COMPILER)
#define _noalias restrict
#elif defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
// settings to enable LAMMPS to build under Windows
#ifdef _WIN32

View File

@ -15,7 +15,6 @@
#define ATOBIGINT _atoi64
#define pclose _pclose
#define __restrict__ __restrict
// the following functions ared defined to get rid of
// 'ambiguous call to overloaded function' error in VSS for mismathched type arguments