From 777e82995de257844ca6d7101acdc78219d1c139 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 5 Sep 2014 14:42:44 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12406 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/OPT/pair_eam_opt.cpp | 38 +++--- src/OPT/pair_lj_charmm_coul_long_opt.cpp | 30 ++--- src/OPT/pair_lj_cut_opt.cpp | 26 ++--- src/OPT/pair_morse_opt.cpp | 26 ++--- src/USER-INTEL/README | 8 +- src/USER-INTEL/fix_intel.h | 44 +++---- src/USER-INTEL/intel_buffers.cpp | 6 +- src/USER-INTEL/intel_buffers.h | 7 +- src/USER-INTEL/intel_preprocess.h | 35 ++++-- src/USER-INTEL/math_extra_intel.h | 2 +- src/USER-INTEL/neigh_half_bin_intel.cpp | 109 +++++++++--------- src/USER-INTEL/pair_gayberne_intel.cpp | 59 ++++++---- src/USER-INTEL/pair_gayberne_intel.h | 5 +- .../pair_lj_charmm_coul_long_intel.cpp | 42 +++---- .../pair_lj_charmm_coul_long_intel.h | 4 +- .../pair_lj_cut_coul_long_intel.cpp | 42 +++---- src/USER-INTEL/pair_lj_cut_coul_long_intel.h | 4 +- src/USER-INTEL/pair_lj_cut_intel.cpp | 28 ++--- src/USER-INTEL/pair_lj_cut_intel.h | 2 +- src/USER-INTEL/verlet_intel.cpp | 98 ++++------------ src/USER-INTEL/verlet_intel.h | 3 +- src/USER-INTEL/verlet_split_intel.cpp | 6 + src/USER-MISC/pair_list.cpp | 5 - src/USER-OMP/domain_omp.cpp | 5 - src/USER-OMP/fix_nh_asphere_omp.cpp | 5 - src/USER-OMP/fix_nh_omp.cpp | 5 - src/USER-OMP/fix_nh_sphere_omp.cpp | 5 - src/USER-OMP/fix_nve_omp.cpp | 5 - src/USER-OMP/fix_nvt_sllod_omp.cpp | 5 - src/USER-OMP/fix_rigid_nh_omp.cpp | 5 - src/USER-OMP/fix_rigid_omp.cpp | 5 - src/USER-OMP/fix_rigid_small_omp.cpp | 5 - src/USER-OMP/thr_omp.h | 10 -- src/lmptype.h | 30 +++++ src/lmpwindows.h | 1 - 35 files changed, 340 insertions(+), 375 deletions(-) diff --git a/src/OPT/pair_eam_opt.cpp b/src/OPT/pair_eam_opt.cpp index ba0512df39..1f20db29a2 100644 --- a/src/OPT/pair_eam_opt.cpp +++ b/src/OPT/pair_eam_opt.cpp @@ -81,7 +81,7 @@ void PairEAMOpt::eval() int i,j,ii,jj,inum,jnum,itype,jtype; double evdwl = 0.0; - double* __restrict__ coeff; + double* _noalias coeff; // grow energy array if necessary @@ -93,13 +93,13 @@ void PairEAMOpt::eval() fp = (double *) memory->smalloc(nmax*sizeof(double),"pair:fp"); } - double** __restrict__ x = atom->x; - double** __restrict__ f = atom->f; - int* __restrict__ type = atom->type; + double** _noalias x = atom->x; + double** _noalias f = atom->f; + int* _noalias type = atom->type; int nlocal = atom->nlocal; - vec3_t* __restrict__ xx = (vec3_t*)x[0]; - vec3_t* __restrict__ ff = (vec3_t*)f[0]; + vec3_t* _noalias xx = (vec3_t*)x[0]; + vec3_t* _noalias ff = (vec3_t*)f[0]; double tmp_cutforcesq = cutforcesq; double tmp_rdr = rdr; @@ -107,17 +107,17 @@ void PairEAMOpt::eval() int nr1 = nr-1; inum = list->inum; - int* __restrict__ ilist = list->ilist; - int** __restrict__ firstneigh = list->firstneigh; - int* __restrict__ numneigh = list->numneigh; + int* _noalias ilist = list->ilist; + int** _noalias firstneigh = list->firstneigh; + int* _noalias numneigh = list->numneigh; int ntypes = atom->ntypes; int ntypes2 = ntypes*ntypes; - fast_alpha_t* __restrict__ fast_alpha = + fast_alpha_t* _noalias fast_alpha = (fast_alpha_t*) malloc(ntypes2*(nr+1)*sizeof(fast_alpha_t)); for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) { - fast_alpha_t* __restrict__ tab = &fast_alpha[i*ntypes*nr+j*nr]; + fast_alpha_t* _noalias tab = &fast_alpha[i*ntypes*nr+j*nr]; if (type2rhor[i+1][j+1] >= 0) { for(int m = 1; m <= nr; m++) { tab[m].rhor0i = rhor_spline[type2rhor[i+1][j+1]][m][6]; @@ -135,12 +135,12 @@ void PairEAMOpt::eval() } } } - fast_alpha_t* __restrict__ tabeight = fast_alpha; + fast_alpha_t* _noalias tabeight = fast_alpha; - fast_gamma_t* __restrict__ fast_gamma = + fast_gamma_t* _noalias fast_gamma = (fast_gamma_t*) malloc(ntypes2*(nr+1)*sizeof(fast_gamma_t)); for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) { - fast_gamma_t* __restrict__ tab = &fast_gamma[i*ntypes*nr+j*nr]; + fast_gamma_t* _noalias tab = &fast_gamma[i*ntypes*nr+j*nr]; if (type2rhor[i+1][j+1] >= 0) { for(int m = 1; m <= nr; m++) { tab[m].rhor4i = rhor_spline[type2rhor[i+1][j+1]][m][2]; @@ -168,7 +168,7 @@ void PairEAMOpt::eval() } } } - fast_gamma_t* __restrict__ tabss = fast_gamma; + fast_gamma_t* _noalias tabss = fast_gamma; // zero out density @@ -188,11 +188,11 @@ void PairEAMOpt::eval() double ytmp = xx[i].y; double ztmp = xx[i].z; itype = type[i] - 1; - int* __restrict__ jlist = firstneigh[i]; + int* _noalias jlist = firstneigh[i]; jnum = numneigh[i]; double tmprho = rho[i]; - fast_alpha_t* __restrict__ tabeighti = &tabeight[itype*ntypes*nr]; + fast_alpha_t* _noalias tabeighti = &tabeight[itype*ntypes*nr]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; @@ -265,14 +265,14 @@ void PairEAMOpt::eval() double ytmp = xx[i].y; double ztmp = xx[i].z; int itype1 = type[i] - 1; - int* __restrict__ jlist = firstneigh[i]; + int* _noalias jlist = firstneigh[i]; jnum = numneigh[i]; double tmpfx = 0.0; double tmpfy = 0.0; double tmpfz = 0.0; - fast_gamma_t* __restrict__ tabssi = &tabss[itype1*ntypes*nr]; + fast_gamma_t* _noalias tabssi = &tabss[itype1*ntypes*nr]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; diff --git a/src/OPT/pair_lj_charmm_coul_long_opt.cpp b/src/OPT/pair_lj_charmm_coul_long_opt.cpp index 5979141e3c..136b9fc64d 100644 --- a/src/OPT/pair_lj_charmm_coul_long_opt.cpp +++ b/src/OPT/pair_lj_charmm_coul_long_opt.cpp @@ -84,22 +84,22 @@ void PairLJCharmmCoulLongOpt::eval() double evdwl = 0.0; double ecoul = 0.0; - double** __restrict__ x = atom->x; - double** __restrict__ f = atom->f; - double* __restrict__ q = atom->q; - int* __restrict__ type = atom->type; + double** _noalias x = atom->x; + double** _noalias f = atom->f; + double* _noalias q = atom->q; + int* _noalias type = atom->type; int nlocal = atom->nlocal; - double* __restrict__ special_coul = force->special_coul; - double* __restrict__ special_lj = force->special_lj; + double* _noalias special_coul = force->special_coul; + double* _noalias special_lj = force->special_lj; double qqrd2e = force->qqrd2e; inum = list->inum; - int* __restrict__ ilist = list->ilist; - int** __restrict__ firstneigh = list->firstneigh; - int* __restrict__ numneigh = list->numneigh; + int* _noalias ilist = list->ilist; + int** _noalias firstneigh = list->firstneigh; + int* _noalias numneigh = list->numneigh; - vec3_t* __restrict__ xx = (vec3_t*)x[0]; - vec3_t* __restrict__ ff = (vec3_t*)f[0]; + vec3_t* _noalias xx = (vec3_t*)x[0]; + vec3_t* _noalias ff = (vec3_t*)f[0]; int ntypes = atom->ntypes; int ntypes2 = ntypes*ntypes; @@ -107,7 +107,7 @@ void PairLJCharmmCoulLongOpt::eval() double tmp_coef1 = 1.0/denom_lj; double tmp_coef2 = cut_ljsq - 3.0*cut_lj_innersq; - fast_alpha_t* __restrict__ fast_alpha = + fast_alpha_t* _noalias fast_alpha = (fast_alpha_t*)malloc(ntypes2*sizeof(fast_alpha_t)); for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) { fast_alpha_t& a = fast_alpha[i*ntypes+j]; @@ -117,7 +117,7 @@ void PairLJCharmmCoulLongOpt::eval() a.lj3 = lj3[i+1][j+1]; a.lj4 = lj4[i+1][j+1]; } - fast_alpha_t* __restrict__ tabsix = fast_alpha; + fast_alpha_t* _noalias tabsix = fast_alpha; // loop over neighbors of my atoms @@ -128,14 +128,14 @@ void PairLJCharmmCoulLongOpt::eval() double ytmp = xx[i].y; double ztmp = xx[i].z; itype = type[i] - 1; - int* __restrict__ jlist = firstneigh[i]; + int* _noalias jlist = firstneigh[i]; jnum = numneigh[i]; double tmpfx = 0.0; double tmpfy = 0.0; double tmpfz = 0.0; - fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes]; + fast_alpha_t* _noalias tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; diff --git a/src/OPT/pair_lj_cut_opt.cpp b/src/OPT/pair_lj_cut_opt.cpp index a3d90eae50..170a3aa728 100644 --- a/src/OPT/pair_lj_cut_opt.cpp +++ b/src/OPT/pair_lj_cut_opt.cpp @@ -67,24 +67,24 @@ void PairLJCutOpt::eval() double factor_lj; double evdwl = 0.0; - double** __restrict__ x = atom->x; - double** __restrict__ f = atom->f; - int* __restrict__ type = atom->type; + double** _noalias x = atom->x; + double** _noalias f = atom->f; + int* _noalias type = atom->type; int nlocal = atom->nlocal; - double* __restrict__ special_lj = force->special_lj; + double* _noalias special_lj = force->special_lj; inum = list->inum; - int* __restrict__ ilist = list->ilist; - int** __restrict__ firstneigh = list->firstneigh; - int* __restrict__ numneigh = list->numneigh; + int* _noalias ilist = list->ilist; + int** _noalias firstneigh = list->firstneigh; + int* _noalias numneigh = list->numneigh; - vec3_t* __restrict__ xx = (vec3_t*)x[0]; - vec3_t* __restrict__ ff = (vec3_t*)f[0]; + vec3_t* _noalias xx = (vec3_t*)x[0]; + vec3_t* _noalias ff = (vec3_t*)f[0]; int ntypes = atom->ntypes; int ntypes2 = ntypes*ntypes; - fast_alpha_t* __restrict__ fast_alpha = + fast_alpha_t* _noalias fast_alpha = (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t)); for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) { fast_alpha_t& a = fast_alpha[i*ntypes+j]; @@ -95,7 +95,7 @@ void PairLJCutOpt::eval() a.lj4 = lj4[i+1][j+1]; a.offset = offset[i+1][j+1]; } - fast_alpha_t* __restrict__ tabsix = fast_alpha; + fast_alpha_t* _noalias tabsix = fast_alpha; // loop over neighbors of my atoms @@ -105,14 +105,14 @@ void PairLJCutOpt::eval() double ytmp = xx[i].y; double ztmp = xx[i].z; itype = type[i] - 1; - int* __restrict__ jlist = firstneigh[i]; + int* _noalias jlist = firstneigh[i]; jnum = numneigh[i]; double tmpfx = 0.0; double tmpfy = 0.0; double tmpfz = 0.0; - fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes]; + fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; diff --git a/src/OPT/pair_morse_opt.cpp b/src/OPT/pair_morse_opt.cpp index c27dd053ff..6fcb9843a6 100644 --- a/src/OPT/pair_morse_opt.cpp +++ b/src/OPT/pair_morse_opt.cpp @@ -68,24 +68,24 @@ void PairMorseOpt::eval() double factor_lj; double evdwl = 0.0; - double** __restrict__ x = atom->x; - double** __restrict__ f = atom->f; - int* __restrict__ type = atom->type; + double** _noalias x = atom->x; + double** _noalias f = atom->f; + int* _noalias type = atom->type; int nlocal = atom->nlocal; - double* __restrict__ special_lj = force->special_lj; + double* _noalias special_lj = force->special_lj; inum = list->inum; - int* __restrict__ ilist = list->ilist; - int** __restrict__ firstneigh = list->firstneigh; - int* __restrict__ numneigh = list->numneigh; + int* _noalias ilist = list->ilist; + int** _noalias firstneigh = list->firstneigh; + int* _noalias numneigh = list->numneigh; - vec3_t* __restrict__ xx = (vec3_t*)x[0]; - vec3_t* __restrict__ ff = (vec3_t*)f[0]; + vec3_t* _noalias xx = (vec3_t*)x[0]; + vec3_t* _noalias ff = (vec3_t*)f[0]; int ntypes = atom->ntypes; int ntypes2 = ntypes*ntypes; - fast_alpha_t* __restrict__ fast_alpha = + fast_alpha_t* _noalias fast_alpha = (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t)); for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) { fast_alpha_t& a = fast_alpha[i*ntypes+j]; @@ -96,7 +96,7 @@ void PairMorseOpt::eval() a.d0 = d0[i+1][j+1]; a.offset = offset[i+1][j+1]; } - fast_alpha_t* __restrict__ tabsix = fast_alpha; + fast_alpha_t* _noalias tabsix = fast_alpha; // loop over neighbors of my atoms @@ -106,14 +106,14 @@ void PairMorseOpt::eval() double ytmp = xx[i].y; double ztmp = xx[i].z; itype = type[i] - 1; - int* __restrict__ jlist = firstneigh[i]; + int* _noalias jlist = firstneigh[i]; jnum = numneigh[i]; double tmpfx = 0.0; double tmpfy = 0.0; double tmpfz = 0.0; - fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes]; + fast_alpha_t* _noalias tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index 27c60d237a..930cacdd38 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -38,8 +38,6 @@ Intel compilers. ----------------------------------------------------------------------------- -The files in this package must be compiled with the Intel C++ -compiler, i.e. icc/icpc. - - - +For portability reasons, vectorization directives are currently only enabled +for Intel compilers. Using other compilers may result in significantly +lower performance. diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 82ebc734a2..5b7d2b3926 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -128,7 +128,7 @@ class FixIntel : public Fix { protected: int _overflow_flag[5]; - __declspec(align(64)) int _off_overflow_flag[5]; + _alignvar(int _off_overflow_flag[5],64); int _allow_separate_buffers, _offload_ghost; #ifdef _LMP_INTEL_OFFLOAD double _balance_pair_time, _balance_other_time; @@ -155,18 +155,18 @@ class FixIntel : public Fix { double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed; double _timers[NUM_ITIMERS]; double _stopwatch[NUM_ITIMERS]; - __declspec(align(64)) double _stopwatch_offload_neighbor[1]; - __declspec(align(64)) double _stopwatch_offload_pair[1]; + _alignvar(double _stopwatch_offload_neighbor[1],64); + _alignvar(double _stopwatch_offload_pair[1],64); template - inline void add_results(const ft * restrict const f_in, - const acc_t * restrict const ev_global, + inline void add_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, const int eatom, const int vatom, const int offload); template - inline void add_oresults(const ft * restrict const f_in, - const acc_t * restrict const ev_global, + inline void add_oresults(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, const int eatom, const int vatom, const int out_offset, const int nall); @@ -176,8 +176,8 @@ class FixIntel : public Fix { int _im_real_space_task; MPI_Comm _real_space_comm; template - inline void add_off_results(const ft * restrict const f_in, - const acc_t * restrict const ev_global); + inline void add_off_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global); #endif }; @@ -284,8 +284,8 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, /* ---------------------------------------------------------------------- */ template -void FixIntel::add_results(const ft * restrict const f_in, - const acc_t * restrict const ev_global, +void FixIntel::add_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, const int eatom, const int vatom, const int offload) { start_watch(TIME_PACK); @@ -295,7 +295,7 @@ void FixIntel::add_results(const ft * restrict const f_in, if (offload) { add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); if (force->newton_pair) { - const acc_t * restrict const enull = 0; + const acc_t * _noalias const enull = 0; int offset = _offload_nlocal; if (atom->torque) offset *= 2; add_oresults(f_in + offset, enull, eatom, vatom, @@ -305,7 +305,7 @@ void FixIntel::add_results(const ft * restrict const f_in, add_oresults(f_in, ev_global, eatom, vatom, _host_min_local, _host_used_local); if (force->newton_pair) { - const acc_t * restrict const enull = 0; + const acc_t * _noalias const enull = 0; int offset = _host_used_local; if (atom->torque) offset *= 2; add_oresults(f_in + offset, enull, eatom, @@ -333,11 +333,11 @@ void FixIntel::add_results(const ft * restrict const f_in, /* ---------------------------------------------------------------------- */ template -void FixIntel::add_oresults(const ft * restrict const f_in, - const acc_t * restrict const ev_global, +void FixIntel::add_oresults(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global, const int eatom, const int vatom, const int out_offset, const int nall) { - lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset; + lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; if (atom->torque) { if (f_in[1].w) if (f_in[1].w == 1) @@ -351,12 +351,16 @@ void FixIntel::add_oresults(const ft * restrict const f_in, #pragma omp parallel default(none) #endif { + #if defined(_OPENMP) const int tid = omp_get_thread_num(); + #else + const int tid = 0; + #endif int ifrom, ito; IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); if (atom->torque) { int ii = ifrom * 2; - lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] + + lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + out_offset; if (eatom) { for (int i = ifrom; i < ito; i++) { @@ -440,6 +444,7 @@ void FixIntel::balance_stamp() { /* ---------------------------------------------------------------------- */ void FixIntel::acc_timers() { + _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair; if (neighbor->ago == 0) { _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor; if (_setup_time_cleared == false) { @@ -447,7 +452,6 @@ void FixIntel::acc_timers() { _setup_time_cleared = true; } } - _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair; } /* ---------------------------------------------------------------------- */ @@ -464,8 +468,8 @@ void FixIntel::set_neighbor_host_sizes() { /* ---------------------------------------------------------------------- */ template -void FixIntel::add_off_results(const ft * restrict const f_in, - const acc_t * restrict const ev_global) { +void FixIntel::add_off_results(const ft * _noalias const f_in, + const acc_t * _noalias const ev_global) { if (_offload_balance < 0.0) _balance_other_time = MPI_Wtime() - _balance_other_time; diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index a541f0f359..d88d4cb377 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -22,8 +22,8 @@ using namespace LAMMPS_NS; template IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : - lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _buf_size(0), - _buf_local_size(0), _off_threads(0) { + lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0), + _buf_size(0), _buf_local_size(0) { _list_alloc_atoms = 0; _ntypes = 0; _off_map_maxlocal = 0; @@ -423,6 +423,8 @@ double IntelBuffers::memory_usage(const int nthreads) tmem += _off_map_maxlocal * sizeof(int); tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int); tmem += _ntypes * _ntypes * sizeof(int); + + return tmem; } /* ---------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index bc1ca9e3b8..4b60c87f04 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -235,7 +235,8 @@ class IntelBuffers { double memory_usage(const int nthreads); - int _special_holder, _nspecial_holder; + tagint _special_holder; + int _nspecial_holder; protected: LAMMPS *lmp; @@ -266,8 +267,8 @@ class IntelBuffers { #endif int _buf_size, _buf_local_size; - __declspec(align(64)) acc_t _ev_global[8]; - __declspec(align(64)) acc_t _ev_global_host[8]; + _alignvar(acc_t _ev_global[8],64); + _alignvar(acc_t _ev_global_host[8],64); void _grow(const int nall, const int nlocal, const int nthreads, const int offload_end); diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index 49e3413e0a..da68ecc934 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -26,6 +26,22 @@ #ifndef LAMMPS_MEMALIGN #error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile. +#else +#if (LAMMPS_MEMALIGN != 64) +#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile. +#endif +#endif + +#if defined(_OPENMP) +#define _use_omp_pragma(txt) _Pragma(txt) +#else +#define _use_omp_pragma(txt) +#endif + +#if defined(__INTEL_COMPILER) +#define _use_simd_pragma(txt) _Pragma(txt) +#else +#define _use_simd_pragma(txt) #endif namespace LAMMPS_NS { @@ -141,7 +157,7 @@ inline double MIC_Wtime() { if (fix->separate_buffers() && ago != 0) { \ fix->start_watch(TIME_PACK); \ if (offload) { \ - _Pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \ + _use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \ { \ int ifrom, ito, tid; \ int nthreads = comm->nthreads; \ @@ -343,15 +359,16 @@ inline double MIC_Wtime() { else \ o_range = nlocal; \ if (offload == 0) o_range -= minlocal; \ - IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \ + IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \ sizeof(acc_t)); \ \ int t_off = f_stride; \ if (eflag && eatom) { \ for (int t = 1; t < nthreads; t++) { \ - _Pragma("vector nontemporal") \ + _use_simd_pragma("vector nontemporal") \ + _use_simd_pragma("novector") \ for (int n = iifrom; n < iito; n++) { \ - f_start[n].x += f_start[n + t_off].x; \ + f_start[n].x += f_start[n + t_off].x; \ f_start[n].y += f_start[n + t_off].y; \ f_start[n].z += f_start[n + t_off].z; \ f_start[n].w += f_start[n + t_off].w; \ @@ -360,8 +377,9 @@ inline double MIC_Wtime() { } \ } else { \ for (int t = 1; t < nthreads; t++) { \ - _Pragma("vector nontemporal") \ - for (int n = iifrom; n < iito; n++) { \ + _use_simd_pragma("vector nontemporal") \ + _use_simd_pragma("novector") \ + for (int n = iifrom; n < iito; n++) { \ f_start[n].x += f_start[n + t_off].x; \ f_start[n].y += f_start[n + t_off].y; \ f_start[n].z += f_start[n + t_off].z; \ @@ -372,8 +390,9 @@ inline double MIC_Wtime() { \ if (evflag) { \ if (vflag == 2) { \ - const ATOM_T * restrict const xo = x + minlocal; \ - _Pragma("vector nontemporal") \ + const ATOM_T * _noalias const xo = x + minlocal; \ + _use_simd_pragma("vector nontemporal") \ + _use_simd_pragma("novector") \ for (int n = iifrom; n < iito; n++) { \ ov0 += f_start[n].x * xo[n].x; \ ov1 += f_start[n].y * xo[n].y; \ diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h index 62163b3f60..780d75b79b 100644 --- a/src/USER-INTEL/math_extra_intel.h +++ b/src/USER-INTEL/math_extra_intel.h @@ -287,7 +287,7 @@ if (fabs(aug_8) > fabs(aug_0)) { \ flt_t swapt; \ swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ - swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ + swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ } \ diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp index a5f12a56f9..162a73c83f 100644 --- a/src/USER-INTEL/neigh_half_bin_intel.cpp +++ b/src/USER-INTEL/neigh_half_bin_intel.cpp @@ -79,7 +79,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2, const int n1 = nspecial[i * 3]; \ const int n2 = nspecial[i * 3 + 1]; \ const int n3 = nspecial[i * 3 + 2]; \ - const int *sptr = special + i * maxspecial; \ + const tagint *sptr = special + i * maxspecial; \ for (int s = 0; s < n3; s++) { \ if (sptr[s] == tag) { \ if (s < n1) { \ @@ -105,7 +105,7 @@ inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2, template void Neighbor::bin_atoms(void * xin) { - const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin; + const ATOM_T * _noalias const x = (const ATOM_T * _noalias const)xin; int nlocal = atom->nlocal; const int nall = nlocal + atom->nghost; @@ -243,11 +243,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, return; } - const ATOM_T * restrict const x = buffers->get_x(); - int * restrict const firstneigh = buffers->firstneigh(list); + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); const int molecular = atom->molecular; - int *ns = NULL, *s = NULL; + int *ns = NULL; + tagint *s = NULL; int tag_size, special_size; if (molecular) { s = atom->special[0]; @@ -260,23 +261,23 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, tag_size = 0; special_size = 0; } - const int * restrict const special = s; - const int * restrict const nspecial = ns; + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; const int maxspecial = atom->maxspecial; - const int * restrict const tag = atom->tag; + const tagint * _noalias const tag = atom->tag; - int * restrict const ilist = list->ilist; - int * restrict numneigh = list->numneigh; - int * restrict const cnumneigh = buffers->cnumneigh(list); + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); const int nstencil = list->nstencil; - const int * restrict const stencil = list->stencil; - const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int * _noalias const stencil = list->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; const int ntypes = atom->ntypes + 1; const int nlocal = atom->nlocal; #ifndef _LMP_INTEL_OFFLOAD int * const mask = atom->mask; - int * const molecule = atom->molecule; + tagint * const molecule = atom->molecule; #endif int tnum; @@ -316,8 +317,8 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, } #ifdef _LMP_INTEL_OFFLOAD - const int * restrict const binhead = this->binhead; - const int * restrict const special_flag = this->special_flag; + const int * _noalias const binhead = this->binhead; + const int * _noalias const special_flag = this->special_flag; const int nbinx = this->nbinx; const int nbiny = this->nbiny; const int nbinz = this->nbinz; @@ -327,7 +328,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, const int mbinx = this->mbinx; const int mbiny = this->mbiny; const int mbinz = this->mbinz; - const int * restrict const bins = this->bins; + const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); #pragma offload target(mic:cop) if(offload) \ @@ -486,7 +487,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, if (molecular) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj]; @@ -507,7 +508,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD else if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; int jj = 0; for (jj = 0; jj < jnum; jj++) @@ -662,14 +663,15 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, return; } - const ATOM_T * restrict const x = buffers->get_x(); - int * restrict const firstneigh = buffers->firstneigh(list); + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); int nall_t = nall; if (offload_noghost && offload) nall_t = atom->nlocal; const int e_nall = nall_t; const int molecular = atom->molecular; - int *ns = NULL, *s = NULL; + int *ns = NULL; + tagint *s = NULL; int tag_size, special_size; if (molecular) { s = atom->special[0]; @@ -682,23 +684,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, tag_size = 0; special_size = 0; } - const int * restrict const special = s; - const int * restrict const nspecial = ns; + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; const int maxspecial = atom->maxspecial; - const int * restrict const tag = atom->tag; + const tagint * _noalias const tag = atom->tag; - int * restrict const ilist = list->ilist; - int * restrict numneigh = list->numneigh; - int * restrict const cnumneigh = buffers->cnumneigh(list); + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); const int nstencil = list->nstencil; - const int * restrict const stencil = list->stencil; - const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int * _noalias const stencil = list->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; const int ntypes = atom->ntypes + 1; const int nlocal = atom->nlocal; #ifndef _LMP_INTEL_OFFLOAD int * const mask = atom->mask; - int * const molecule = atom->molecule; + tagint * const molecule = atom->molecule; #endif int tnum; @@ -737,8 +739,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, } #ifdef _LMP_INTEL_OFFLOAD - const int * restrict const binhead = this->binhead; - const int * restrict const special_flag = this->special_flag; + const int * _noalias const binhead = this->binhead; + const int * _noalias const special_flag = this->special_flag; const int nbinx = this->nbinx; const int nbiny = this->nbiny; const int nbinz = this->nbinz; @@ -748,7 +750,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, const int mbinx = this->mbinx; const int mbiny = this->mbiny; const int mbinz = this->mbinz; - const int * restrict const bins = this->bins; + const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); #pragma offload target(mic:cop) if(offload) \ @@ -948,7 +950,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, if (molecular) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj]; @@ -970,7 +972,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD else if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; int jj = 0; for (jj = 0; jj < jnum; jj++) @@ -1127,14 +1129,15 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, return; } - const ATOM_T * restrict const x = buffers->get_x(); - int * restrict const firstneigh = buffers->firstneigh(list); + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); int nall_t = nall; if (offload_noghost && offload) nall_t = atom->nlocal; const int e_nall = nall_t; const int molecular = atom->molecular; - int *ns = NULL, *s = NULL; + int *ns = NULL; + tagint *s = NULL; int tag_size, special_size; if (molecular) { s = atom->special[0]; @@ -1147,23 +1150,23 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, tag_size = 0; special_size = 0; } - const int * restrict const special = s; - const int * restrict const nspecial = ns; + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; const int maxspecial = atom->maxspecial; - const int * restrict const tag = atom->tag; + const tagint * _noalias const tag = atom->tag; - int * restrict const ilist = list->ilist; - int * restrict numneigh = list->numneigh; - int * restrict const cnumneigh = buffers->cnumneigh(list); + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); const int nstencil = list->nstencil; - const int * restrict const stencil = list->stencil; - const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0]; + const int * _noalias const stencil = list->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; const int ntypes = atom->ntypes + 1; const int nlocal = atom->nlocal; #ifndef _LMP_INTEL_OFFLOAD int * const mask = atom->mask; - int * const molecule = atom->molecule; + tagint * const molecule = atom->molecule; #endif int tnum; @@ -1202,8 +1205,8 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, } #ifdef _LMP_INTEL_OFFLOAD - const int * restrict const binhead = this->binhead; - const int * restrict const special_flag = this->special_flag; + const int * _noalias const binhead = this->binhead; + const int * _noalias const special_flag = this->special_flag; const int nbinx = this->nbinx; const int nbiny = this->nbiny; const int nbinz = this->nbinz; @@ -1213,7 +1216,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, const int mbinx = this->mbinx; const int mbiny = this->mbiny; const int mbinz = this->mbinz; - const int * restrict const bins = this->bins; + const int * _noalias const bins = this->bins; const int cop = fix->coprocessor_number(); const int separate_buffers = fix->separate_buffers(); #pragma offload target(mic:cop) if(offload) \ @@ -1386,7 +1389,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, if (molecular) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj]; @@ -1407,7 +1410,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in, #ifdef _LMP_INTEL_OFFLOAD else if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { - int * restrict jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; int jj = 0; for (jj = 0; jj < jnum; jj++) diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index 46e608c92f..aae42a7145 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -79,7 +79,7 @@ void PairGayBerneIntel::compute(int eflag, int vflag, fix->start_watch(TIME_PACK); const AtomVecEllipsoid::Bonus * const bonus = avec->bonus; const int * const ellipsoid = atom->ellipsoid; - QUAT_T * restrict const quat = buffers->get_quat(); + QUAT_T * _noalias const quat = buffers->get_quat(); #if defined(_OPENMP) #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) #endif @@ -150,8 +150,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, fix->get_buffern(offload, nlocal, nall, minlocal); const int ago = neighbor->ago; - ATOM_T * restrict const x = buffers->get_x(offload); - QUAT_T * restrict const quat = buffers->get_quat(offload); + ATOM_T * _noalias const x = buffers->get_x(offload); + QUAT_T * _noalias const quat = buffers->get_quat(offload); const AtomVecEllipsoid::Bonus *bonus = avec->bonus; const int *ellipsoid = atom->ellipsoid; @@ -225,15 +225,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } #endif - // const int * restrict const ilist = list->ilist; - const int * restrict const numneigh = list->numneigh; - const int * restrict const cnumneigh = buffers->cnumneigh(list); - const int * restrict const firstneigh = buffers->firstneigh(list); - const flt_t * restrict const special_lj = fc.special_lj; + // const int * _noalias const ilist = list->ilist; + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const flt_t * _noalias const special_lj = fc.special_lj; - const FC_PACKED1_T * restrict const ijc = fc.ijc[0]; - const FC_PACKED2_T * restrict const lj34 = fc.lj34[0]; - const FC_PACKED3_T * restrict const ic = fc.ic; + const FC_PACKED1_T * _noalias const ijc = fc.ijc[0]; + const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0]; + const FC_PACKED3_T * _noalias const ic = fc.ic; const flt_t mu = fc.mu; const flt_t gamma = fc.gamma; const flt_t upsilon = fc.upsilon; @@ -255,8 +255,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, x_size, q_size, ev_size, f_stride); int tc; - FORCE_T * restrict f_start; - acc_t * restrict ev_global; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); const int max_nbors = _max_nbors; const int nthreads = tc; @@ -351,25 +351,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride); + FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride); memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); - flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors; - flt_t * restrict const delx_form = delx_formi + tid * max_nbors; - flt_t * restrict const dely_form = dely_formi + tid * max_nbors; - flt_t * restrict const delz_form = delz_formi + tid * max_nbors; - int * restrict const jtype_form = jtype_formi + tid * max_nbors; - int * restrict const jlist_form = jlist_formi + tid * max_nbors; + flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors; + flt_t * _noalias const delx_form = delx_formi + tid * max_nbors; + flt_t * _noalias const dely_form = dely_formi + tid * max_nbors; + flt_t * _noalias const delz_form = delz_formi + tid * max_nbors; + int * _noalias const jtype_form = jtype_formi + tid * max_nbors; + int * _noalias const jlist_form = jlist_formi + tid * max_nbors; int ierror = 0; for (int i = iifrom; i < iito; ++i) { // const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; - const FC_PACKED1_T * restrict const ijci = ijc + ptr_off; - const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off; + const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off; + const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off; - const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; const flt_t xtmp = x[i].x; @@ -433,9 +433,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, __assume(packed_j % 8 == 0); __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); #endif + #if defined(__INTEL_COMPILER) #pragma vector aligned #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) + #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8; @@ -796,7 +798,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, int t_off = f_stride; if (EFLAG && eatom) { for (int t = 1; t < nthreads; t++) { + #if defined(__INTEL_COMPILER) #pragma vector nontemporal + #pragma novector + #endif for (int n = iifrom * 2; n < two_iito; n++) { f_start[n].x += f_start[n + t_off].x; f_start[n].y += f_start[n + t_off].y; @@ -807,7 +812,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } } else { for (int t = 1; t < nthreads; t++) { + #if defined(__INTEL_COMPILER) #pragma vector nontemporal + #pragma novector + #endif for (int n = iifrom * 2; n < two_iito; n++) { f_start[n].x += f_start[n + t_off].x; f_start[n].y += f_start[n + t_off].y; @@ -819,8 +827,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, if (EVFLAG) { if (vflag==2) { - const ATOM_T * restrict const xo = x + minlocal; + const ATOM_T * _noalias const xo = x + minlocal; + #if defined(__INTEL_COMPILER) #pragma vector nontemporal + #pragma novector + #endif for (int n = iifrom; n < iito; n++) { const int nt2 = n * 2; ov0 += f_start[nt2].x * xo[n].x; diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h index eb055e151e..9a4aae6c72 100644 --- a/src/USER-INTEL/pair_gayberne_intel.h +++ b/src/USER-INTEL/pair_gayberne_intel.h @@ -62,7 +62,10 @@ class PairGayBerneIntel : public PairGayBerne { typedef struct { flt_t lj3, lj4; } fc_packed2; typedef struct { flt_t shape2[4], well[4]; } fc_packed3; - __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu; + _alignvar(flt_t special_lj[4],64); + _alignvar(flt_t gamma,64); + _alignvar(flt_t upsilon,64); + _alignvar(flt_t mu,64); fc_packed1 **ijc; fc_packed2 **lj34; fc_packed3 *ic; diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 576d5b21c7..8d23e8f589 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -143,25 +143,25 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const int ago = neighbor->ago; IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); - ATOM_T * restrict const x = buffers->get_x(offload); - flt_t * restrict const q = buffers->get_q(offload); + ATOM_T * _noalias const x = buffers->get_x(offload); + flt_t * _noalias const q = buffers->get_q(offload); - const int * restrict const numneigh = list->numneigh; - const int * restrict const cnumneigh = buffers->cnumneigh(list); - const int * restrict const firstneigh = buffers->firstneigh(list); + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); - const flt_t * restrict const special_coul = fc.special_coul; - const flt_t * restrict const special_lj = fc.special_lj; + const flt_t * _noalias const special_coul = fc.special_coul; + const flt_t * _noalias const special_lj = fc.special_lj; const flt_t qqrd2e = force->qqrd2e; const flt_t inv_denom_lj = (flt_t)1.0/denom_lj; - const flt_t * restrict const cutsq = fc.cutsq[0]; - const LJ_T * restrict const lj = fc.lj[0]; - const TABLE_T * restrict const table = fc.table; - const flt_t * restrict const etable = fc.etable; - const flt_t * restrict const detable = fc.detable; - const flt_t * restrict const ctable = fc.ctable; - const flt_t * restrict const dctable = fc.dctable; + const flt_t * _noalias const cutsq = fc.cutsq[0]; + const LJ_T * _noalias const lj = fc.lj[0]; + const TABLE_T * _noalias const table = fc.table; + const flt_t * _noalias const etable = fc.etable; + const flt_t * _noalias const detable = fc.detable; + const flt_t * _noalias const ctable = fc.ctable; + const flt_t * _noalias const dctable = fc.dctable; const flt_t cut_ljsq = fc.cut_ljsq; const flt_t cut_lj_innersq = fc.cut_lj_innersq; const flt_t cut_coulsq = fc.cut_coulsq; @@ -178,8 +178,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, x_size, q_size, ev_size, f_stride); int tc; - FORCE_T * restrict f_start; - acc_t * restrict ev_global; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); const int nthreads = tc; @@ -242,7 +242,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); flt_t cutboth = cut_coulsq; @@ -251,10 +251,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const int itype = x[i].w; const int ptr_off = itype * ntypes; - const flt_t * restrict const cutsqi = cutsq + ptr_off; - const LJ_T * restrict const lji = lj + ptr_off; + const flt_t * _noalias const cutsqi = cutsq + ptr_off; + const LJ_T * _noalias const lji = lj + ptr_off; - const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -270,9 +270,11 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; } + #if defined(__INTEL_COMPILER) #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ sv0, sv1, sv2, sv3, sv4, sv5) + #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h index ad66c786b6..dbdd6982ab 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -62,8 +62,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { class ForceConst { public: typedef struct { flt_t r, dr, f, df; } table_t; - __declspec(align(64)) flt_t special_coul[4]; - __declspec(align(64)) flt_t special_lj[4]; + _alignvar(flt_t special_coul[4],64); + _alignvar(flt_t special_lj[4],64); flt_t **cutsq, g_ewald, tabinnersq; flt_t cut_coulsq, cut_ljsq; flt_t cut_lj_innersq; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index 4163a1f7d2..582ad7eb85 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -143,24 +143,24 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int ago = neighbor->ago; IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); - ATOM_T * restrict const x = buffers->get_x(offload); - flt_t * restrict const q = buffers->get_q(offload); + ATOM_T * _noalias const x = buffers->get_x(offload); + flt_t * _noalias const q = buffers->get_q(offload); - const int * restrict const numneigh = list->numneigh; - const int * restrict const cnumneigh = buffers->cnumneigh(list); - const int * restrict const firstneigh = buffers->firstneigh(list); + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); - const flt_t * restrict const special_coul = fc.special_coul; - const flt_t * restrict const special_lj = fc.special_lj; + const flt_t * _noalias const special_coul = fc.special_coul; + const flt_t * _noalias const special_lj = fc.special_lj; const flt_t qqrd2e = force->qqrd2e; - const C_FORCE_T * restrict const c_force = fc.c_force[0]; - const C_ENERGY_T * restrict const c_energy = fc.c_energy[0]; - const TABLE_T * restrict const table = fc.table; - const flt_t * restrict const etable = fc.etable; - const flt_t * restrict const detable = fc.detable; - const flt_t * restrict const ctable = fc.ctable; - const flt_t * restrict const dctable = fc.dctable; + const C_FORCE_T * _noalias const c_force = fc.c_force[0]; + const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0]; + const TABLE_T * _noalias const table = fc.table; + const flt_t * _noalias const etable = fc.etable; + const flt_t * _noalias const detable = fc.detable; + const flt_t * _noalias const ctable = fc.ctable; + const flt_t * _noalias const dctable = fc.dctable; const flt_t g_ewald = fc.g_ewald; const flt_t tabinnersq = fc.tabinnersq; @@ -174,8 +174,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, x_size, q_size, ev_size, f_stride); int tc; - FORCE_T * restrict f_start; - acc_t * restrict ev_global; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); const int nthreads = tc; @@ -237,17 +237,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); for (int i = iifrom; i < iito; ++i) { const int itype = x[i].w; const int ptr_off = itype * ntypes; - const C_FORCE_T * restrict const c_forcei = c_force + ptr_off; - const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off; + const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; + const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -263,9 +263,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; } + #if defined(__INTEL_COMPILER) #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ sv0, sv1, sv2, sv3, sv4, sv5) + #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h index d7b4282a99..641c35660c 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h @@ -64,8 +64,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t; typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t; typedef struct { flt_t r, dr, f, df; } table_t; - __declspec(align(64)) flt_t special_coul[4]; - __declspec(align(64)) flt_t special_lj[4]; + _alignvar(flt_t special_coul[4],64); + _alignvar(flt_t special_lj[4],64); flt_t g_ewald, tabinnersq; c_force_t **c_force; c_energy_t **c_energy; diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index bca3a73493..897abdde53 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -134,14 +134,14 @@ void PairLJCutIntel::eval(const int offload, const int vflag, const int ago = neighbor->ago; IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); - ATOM_T * restrict const x = buffers->get_x(offload); + ATOM_T * _noalias const x = buffers->get_x(offload); - const int * restrict const numneigh = list->numneigh; - const int * restrict const cnumneigh = buffers->cnumneigh(list); - const int * restrict const firstneigh = buffers->firstneigh(list); - const flt_t * restrict const special_lj = fc.special_lj; - const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0]; - const FC_PACKED2_T * restrict const lj34 = fc.lj34[0]; + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + const flt_t * _noalias const special_lj = fc.special_lj; + const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0]; + const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0]; const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; @@ -153,8 +153,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag, x_size, q_size, ev_size, f_stride); int tc; - FORCE_T * restrict f_start; - acc_t * restrict ev_global; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); const int nthreads = tc; int *overflow = fix->get_off_overflow_flag(); @@ -184,17 +184,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride); + FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); for (int i = iifrom; i < iito; ++i) { const int itype = x[i].w; const int ptr_off = itype * ntypes; - const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off; - const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off; + const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off; + const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off; - const int * restrict const jlist = firstneigh + cnumneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; acc_t fxtmp, fytmp, fztmp, fwtmp; @@ -209,9 +209,11 @@ void PairLJCutIntel::eval(const int offload, const int vflag, if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; } + #if defined(__INTEL_COMPILER) #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) + #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; forcelj = evdwl = (flt_t)0.0; diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h index a40e39af56..13737affab 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.h +++ b/src/USER-INTEL/pair_lj_cut_intel.h @@ -62,7 +62,7 @@ class PairLJCutIntel : public PairLJCut { typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1; typedef struct { flt_t lj3, lj4; } fc_packed2; - __declspec(align(64)) flt_t special_lj[4]; + _alignvar(flt_t special_lj[4],64); fc_packed1 **ljc12o; fc_packed2 **lj34; diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp index 64177e0f05..5bfd04639c 100644 --- a/src/USER-INTEL/verlet_intel.cpp +++ b/src/USER-INTEL/verlet_intel.cpp @@ -17,6 +17,7 @@ #include "domain.h" #include "comm.h" #include "atom.h" +#include "atom_vec.h" #include "force.h" #include "pair.h" #include "bond.h" @@ -81,14 +82,9 @@ void VerletIntel::init() // set flags for what arrays to clear in force_clear() // need to clear additionals arrays if they exist - torqueflag = 0; + torqueflag = extraflag = 0; if (atom->torque_flag) torqueflag = 1; - erforceflag = 0; - if (atom->erforce_flag) erforceflag = 1; - e_flag = 0; - if (atom->e_flag) e_flag = 1; - rho_flag = 0; - if (atom->rho_flag) rho_flag = 1; + if (atom->avec->forceclearflag) extraflag = 1; // orthogonal vs triclinic simulation box @@ -388,7 +384,7 @@ void VerletIntel::cleanup() void VerletIntel::force_clear() { - int i; + size_t nbytes; if (external_force_clear) return; @@ -396,19 +392,16 @@ void VerletIntel::force_clear() // if either newton flag is set, also include ghosts // when using threads always clear all forces. - if (neighbor->includegroup == 0) { - int nall; - if (force->newton) nall = atom->nlocal + atom->nghost; - else nall = atom->nlocal; + int nlocal = atom->nlocal; - size_t nbytes = sizeof(double) * nall; + if (neighbor->includegroup == 0) { + nbytes = sizeof(double) * nlocal; + if (force->newton) nbytes += sizeof(double) * atom->nghost; if (nbytes) { - memset(&(atom->f[0][0]),0,3*nbytes); - if (torqueflag) memset(&(atom->torque[0][0]),0,3*nbytes); - if (erforceflag) memset(&(atom->erforce[0]), 0, nbytes); - if (e_flag) memset(&(atom->de[0]), 0, nbytes); - if (rho_flag) memset(&(atom->drho[0]), 0, nbytes); + memset(&atom->f[0][0],0,3*nbytes); + if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes); + if (extraflag) atom->avec->force_clear(0,nbytes); } // neighbor includegroup flag is set @@ -416,70 +409,21 @@ void VerletIntel::force_clear() // if either newton flag is set, also include ghosts } else { - int nall = atom->nfirst; + nbytes = sizeof(double) * atom->nfirst; - double **f = atom->f; - for (i = 0; i < nall; i++) { - f[i][0] = 0.0; - f[i][1] = 0.0; - f[i][2] = 0.0; - } - - if (torqueflag) { - double **torque = atom->torque; - for (i = 0; i < nall; i++) { - torque[i][0] = 0.0; - torque[i][1] = 0.0; - torque[i][2] = 0.0; - } - } - - if (erforceflag) { - double *erforce = atom->erforce; - for (i = 0; i < nall; i++) erforce[i] = 0.0; - } - - if (e_flag) { - double *de = atom->de; - for (i = 0; i < nall; i++) de[i] = 0.0; - } - - if (rho_flag) { - double *drho = atom->drho; - for (i = 0; i < nall; i++) drho[i] = 0.0; + if (nbytes) { + memset(&atom->f[0][0],0,3*nbytes); + if (torqueflag) memset(&atom->torque[0][0],0,3*nbytes); + if (extraflag) atom->avec->force_clear(0,nbytes); } if (force->newton) { - nall = atom->nlocal + atom->nghost; + nbytes = sizeof(double) * atom->nghost; - for (i = atom->nlocal; i < nall; i++) { - f[i][0] = 0.0; - f[i][1] = 0.0; - f[i][2] = 0.0; - } - - if (torqueflag) { - double **torque = atom->torque; - for (i = atom->nlocal; i < nall; i++) { - torque[i][0] = 0.0; - torque[i][1] = 0.0; - torque[i][2] = 0.0; - } - } - - if (erforceflag) { - double *erforce = atom->erforce; - for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0; - } - - if (e_flag) { - double *de = atom->de; - for (i = 0; i < nall; i++) de[i] = 0.0; - } - - if (rho_flag) { - double *drho = atom->drho; - for (i = 0; i < nall; i++) drho[i] = 0.0; + if (nbytes) { + memset(&atom->f[nlocal][0],0,3*nbytes); + if (torqueflag) memset(&atom->torque[nlocal][0],0,3*nbytes); + if (extraflag) atom->avec->force_clear(nlocal,nbytes); } } } diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h index de4231431d..166dc137d0 100644 --- a/src/USER-INTEL/verlet_intel.h +++ b/src/USER-INTEL/verlet_intel.h @@ -39,8 +39,7 @@ class VerletIntel : public Integrate { protected: int triclinic; // 0 if domain is orthog, 1 if triclinic - int torqueflag,erforceflag; - int e_flag,rho_flag; + int torqueflag,extraflag; virtual void force_clear(); #ifdef _LMP_INTEL_OFFLOAD diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp index 3976607b18..2eab0078d7 100644 --- a/src/USER-INTEL/verlet_split_intel.cpp +++ b/src/USER-INTEL/verlet_split_intel.cpp @@ -52,6 +52,9 @@ VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) : if (universe->procs_per_world[0] % universe->procs_per_world[1]) error->universe_all(FLERR,"Verlet/split requires Rspace partition " "size be multiple of Kspace partition size"); + if (comm->style != 0) + error->universe_all(FLERR,"Verlet/split can only currently be used with " + "comm_style brick"); // master = 1 for Rspace procs, 0 for Kspace procs @@ -214,6 +217,9 @@ VerletSplitIntel::~VerletSplitIntel() void VerletSplitIntel::init() { + if (comm->style != 0) + error->universe_all(FLERR,"Verlet/split can only currently be used with " + "comm_style brick"); if (!force->kspace && comm->me == 0) error->warning(FLERR,"No Kspace calculation with verlet/split"); diff --git a/src/USER-MISC/pair_list.cpp b/src/USER-MISC/pair_list.cpp index 880cab6c31..2895b8216b 100644 --- a/src/USER-MISC/pair_list.cpp +++ b/src/USER-MISC/pair_list.cpp @@ -48,11 +48,6 @@ static double mypow(double x, int n) { } typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/domain_omp.cpp b/src/USER-OMP/domain_omp.cpp index d5468863cc..584e56b1f0 100644 --- a/src/USER-OMP/domain_omp.cpp +++ b/src/USER-OMP/domain_omp.cpp @@ -22,11 +22,6 @@ using namespace LAMMPS_NS; typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- enforce PBC and modify box image flags for each atom diff --git a/src/USER-OMP/fix_nh_asphere_omp.cpp b/src/USER-OMP/fix_nh_asphere_omp.cpp index 28a4079ead..4286140a43 100644 --- a/src/USER-OMP/fix_nh_asphere_omp.cpp +++ b/src/USER-OMP/fix_nh_asphere_omp.cpp @@ -33,11 +33,6 @@ using namespace FixConst; enum{NOBIAS,BIAS}; typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/fix_nh_omp.cpp b/src/USER-OMP/fix_nh_omp.cpp index 4bcde85e03..591503d01e 100644 --- a/src/USER-OMP/fix_nh_omp.cpp +++ b/src/USER-OMP/fix_nh_omp.cpp @@ -34,11 +34,6 @@ enum{ISO,ANISO,TRICLINIC}; #define TILTMAX 1.5 typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- change box size diff --git a/src/USER-OMP/fix_nh_sphere_omp.cpp b/src/USER-OMP/fix_nh_sphere_omp.cpp index a180e7270f..79d98a6cca 100644 --- a/src/USER-OMP/fix_nh_sphere_omp.cpp +++ b/src/USER-OMP/fix_nh_sphere_omp.cpp @@ -31,11 +31,6 @@ enum{NOBIAS,BIAS}; #define INERTIA 0.4 // moment of inertia prefactor for sphere typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/fix_nve_omp.cpp b/src/USER-OMP/fix_nve_omp.cpp index dd0ae12a5c..9e132aa539 100644 --- a/src/USER-OMP/fix_nve_omp.cpp +++ b/src/USER-OMP/fix_nve_omp.cpp @@ -19,11 +19,6 @@ using namespace LAMMPS_NS; using namespace FixConst; typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/fix_nvt_sllod_omp.cpp b/src/USER-OMP/fix_nvt_sllod_omp.cpp index a3d1c5b04d..0ce946e769 100644 --- a/src/USER-OMP/fix_nvt_sllod_omp.cpp +++ b/src/USER-OMP/fix_nvt_sllod_omp.cpp @@ -34,11 +34,6 @@ using namespace FixConst; enum{NO_REMAP,X_REMAP,V_REMAP}; // same as fix_deform.cpp typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/fix_rigid_nh_omp.cpp b/src/USER-OMP/fix_rigid_nh_omp.cpp index 022a09d6a1..1b16657c9c 100644 --- a/src/USER-OMP/fix_rigid_nh_omp.cpp +++ b/src/USER-OMP/fix_rigid_nh_omp.cpp @@ -48,11 +48,6 @@ enum{ISO,ANISO,TRICLINIC}; // same as in FixRigid #define EINERTIA 0.4 // moment of inertia prefactor for ellipsoid typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- perform preforce velocity Verlet integration diff --git a/src/USER-OMP/fix_rigid_omp.cpp b/src/USER-OMP/fix_rigid_omp.cpp index 7ffa741185..f3b16a5ecc 100644 --- a/src/USER-OMP/fix_rigid_omp.cpp +++ b/src/USER-OMP/fix_rigid_omp.cpp @@ -42,11 +42,6 @@ enum{SINGLE,MOLECULE,GROUP}; // same as in FixRigid #define EINERTIA 0.4 // moment of inertia prefactor for ellipsoid typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/fix_rigid_small_omp.cpp b/src/USER-OMP/fix_rigid_small_omp.cpp index 292faba919..c6b986da2d 100644 --- a/src/USER-OMP/fix_rigid_small_omp.cpp +++ b/src/USER-OMP/fix_rigid_small_omp.cpp @@ -42,11 +42,6 @@ using namespace MathConst; enum{FULL_BODY,INITIAL,FINAL,FORCE_TORQUE,VCM_ANGMOM,XCM_MASS,ITENSOR,DOF}; typedef struct { double x,y,z; } dbl3_t; -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif /* ---------------------------------------------------------------------- */ diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h index 9348058d87..c5c6094ee6 100644 --- a/src/USER-OMP/thr_omp.h +++ b/src/USER-OMP/thr_omp.h @@ -195,14 +195,4 @@ typedef struct { int a,b,c,d,t; } int5_t; } -#ifdef _noalias -#undef _noalias -#endif - -#if defined(__GNUC__) -#define _noalias __restrict -#else -#define _noalias -#endif - #endif diff --git a/src/lmptype.h b/src/lmptype.h index a03c8601ad..87b7da51ed 100644 --- a/src/lmptype.h +++ b/src/lmptype.h @@ -167,6 +167,36 @@ typedef int bigint; } +// preprocessor macros for compiler specific settings +// clear previous definitions to avoid redefinition warning + +#ifdef _alignvar +#undef _alignvar +#endif +#ifdef _noalias +#undef _noalias +#endif + +// define stack variable alignment + +#if defined(__INTEL_COMPILER) +#define _alignvar(expr,val) __declspec(align(val)) expr +#elif defined(__GNUC__) +#define _alignvar(expr,val) expr __attribute((aligned(val))) +#else +#define _alignvar(expr,val) expr +#endif + +// declaration to lift aliasing restrictions + +#if defined(__INTEL_COMPILER) +#define _noalias restrict +#elif defined(__GNUC__) +#define _noalias __restrict +#else +#define _noalias +#endif + // settings to enable LAMMPS to build under Windows #ifdef _WIN32 diff --git a/src/lmpwindows.h b/src/lmpwindows.h index 50f64eafb2..419aa9aff5 100644 --- a/src/lmpwindows.h +++ b/src/lmpwindows.h @@ -15,7 +15,6 @@ #define ATOBIGINT _atoi64 #define pclose _pclose -#define __restrict__ __restrict // the following functions ared defined to get rid of // 'ambiguous call to overloaded function' error in VSS for mismathched type arguments