lammps/src/USER-OMP/thr_omp.cpp

/* -------------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
OpenMP based threading support for LAMMPS
Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */
#include "thr_omp.h"
#include "memory.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "pair.h"
#include "dihedral.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
ThrOMP::ThrOMP(LAMMPS *ptr, int style) : thr_style(style), lmp(ptr)
{
// initialize fixed size per thread storage
eng_vdwl_thr = eng_coul_thr = eng_bond_thr = NULL;
virial_thr = NULL;
lmp->memory->create(eng_vdwl_thr,lmp->comm->nthreads,"thr_omp:eng_vdwl_thr");
lmp->memory->create(eng_coul_thr,lmp->comm->nthreads,"thr_omp:eng_coul_thr");
lmp->memory->create(eng_bond_thr,lmp->comm->nthreads,"thr_omp:eng_bond_thr");
lmp->memory->create(virial_thr,lmp->comm->nthreads,6,"thr_omp:virial_thr");
// variable size per-thread, per-atom storage
// the actual allocation happens via memory->grow() in ev_setup_thr()
maxeatom_thr = maxvatom_thr = 0;
eatom_thr = NULL;
vatom_thr = NULL;
}
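/* ----------------------------------------------------------------------
   usage note: ThrOMP is meant to be mixed into a threaded style via
   multiple inheritance. a minimal construction sketch, assuming a
   THR_PAIR style constant from thr_omp.h and a derived class along the
   lines of PairLJCutOMP (these names are assumptions for illustration):

     PairLJCutOMP::PairLJCutOMP(LAMMPS *lmp)
       : PairLJCut(lmp), ThrOMP(lmp, THR_PAIR) {}

   all threads of such a style share the per-thread accumulators
   allocated above and index them by their thread id.
------------------------------------------------------------------------- */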
/* ---------------------------------------------------------------------- */
ThrOMP::~ThrOMP()
{
lmp->memory->destroy(eng_vdwl_thr);
lmp->memory->destroy(eng_coul_thr);
lmp->memory->destroy(eng_bond_thr);
lmp->memory->destroy(virial_thr);
lmp->memory->destroy(eatom_thr);
lmp->memory->destroy(vatom_thr);
}
/* ---------------------------------------------------------------------- */
void ThrOMP::ev_zero_acc_thr(int ntotal, int eflag_global, int vflag_global,
int eflag_atom, int vflag_atom, int nthreads)
{
int t,i;
for (t = 0; t < nthreads; ++t) {
if (eflag_global)
eng_vdwl_thr[t] = eng_coul_thr[t] = eng_bond_thr[t] = 0.0;
if (vflag_global)
for (i = 0; i < 6; ++i)
virial_thr[t][i] = 0.0;
if (eflag_atom)
for (i = 0; i < ntotal; ++i)
eatom_thr[t][i] = 0.0;
if (vflag_atom)
for (i = 0; i < ntotal; ++i) {
vatom_thr[t][i][0] = 0.0;
vatom_thr[t][i][1] = 0.0;
vatom_thr[t][i][2] = 0.0;
vatom_thr[t][i][3] = 0.0;
vatom_thr[t][i][4] = 0.0;
vatom_thr[t][i][5] = 0.0;
}
}
}
/* ---------------------------------------------------------------------- */
void ThrOMP::ev_setup_thr(Dihedral *dihed)
{
int nthreads = lmp->comm->nthreads;
// reallocate per-atom arrays if necessary
if (dihed->eflag_atom && lmp->atom->nmax > maxeatom_thr) {
maxeatom_thr = lmp->atom->nmax;
lmp->memory->grow(eatom_thr,nthreads,maxeatom_thr,"thr_omp:eatom_thr");
}
if (dihed->vflag_atom && lmp->atom->nmax > maxvatom_thr) {
maxvatom_thr = lmp->atom->nmax;
lmp->memory->grow(vatom_thr,nthreads,maxvatom_thr,6,"thr_omp:vatom_thr");
}
int ntotal = (lmp->force->newton_bond) ?
(lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;
// zero per thread accumulators
ev_zero_acc_thr(ntotal, dihed->eflag_global, dihed->vflag_global,
dihed->eflag_atom, dihed->vflag_atom, nthreads);
}
/* ---------------------------------------------------------------------- */
void ThrOMP::ev_setup_thr(Pair *pair)
{
int nthreads = lmp->comm->nthreads;
// reallocate per-atom arrays if necessary
if (pair->eflag_atom && lmp->atom->nmax > maxeatom_thr) {
maxeatom_thr = lmp->atom->nmax;
lmp->memory->grow(eatom_thr,nthreads,maxeatom_thr,"thr_omp:eatom_thr");
}
if (pair->vflag_atom && lmp->atom->nmax > maxvatom_thr) {
maxvatom_thr = lmp->atom->nmax;
lmp->memory->grow(vatom_thr,nthreads,maxvatom_thr,6,"thr_omp:vatom_thr");
}
int ntotal = (lmp->force->newton) ?
(lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;
// zero per thread accumulators
ev_zero_acc_thr(ntotal, pair->eflag_global, pair->vflag_global,
pair->eflag_atom, pair->vflag_atom, nthreads);
}
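/* ----------------------------------------------------------------------
   typical call sequence, sketched for a derived pair style's compute().
   this is an illustration only: eval(), nall, and nthreads are assumed
   locals/helpers, and atom->f is assumed to have been grown to hold one
   force block per thread (as the nall*tid offset in loop_setup_thr()
   implies).

     ev_setup(eflag,vflag);              // regular Pair accumulators
     ev_setup_thr(this);                 // per-thread copies (above)
   #if defined(_OPENMP)
   #pragma omp parallel default(shared)
   #endif
     {
       int ifrom,ito,tid;
       double **f = loop_setup_thr(atom->f,ifrom,ito,tid,
                                   list->inum,nall,nthreads);
       eval(f,ifrom,ito,tid);            // thread-local force/E/V loop
       force_reduce_thr(atom->f[0],nall,nthreads,tid);
     }
     ev_reduce_thr(this);                // fold per-thread E/V back in
------------------------------------------------------------------------- */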
/* ----------------------------------------------------------------------
reduce the per thread accumulated E/V data into the canonical accumulators.
------------------------------------------------------------------------- */
void ThrOMP::ev_reduce_thr(Dihedral *dihed)
{
int nthreads = lmp->comm->nthreads;
int ntotal = (lmp->force->newton_bond) ?
(lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;
for (int n = 0; n < nthreads; ++n) {
dihed->energy += eng_bond_thr[n];
if (dihed->vflag_either) {
dihed->virial[0] += virial_thr[n][0];
dihed->virial[1] += virial_thr[n][1];
dihed->virial[2] += virial_thr[n][2];
dihed->virial[3] += virial_thr[n][3];
dihed->virial[4] += virial_thr[n][4];
dihed->virial[5] += virial_thr[n][5];
if (dihed->vflag_atom) {
for (int i = 0; i < ntotal; ++i) {
dihed->vatom[i][0] += vatom_thr[n][i][0];
dihed->vatom[i][1] += vatom_thr[n][i][1];
dihed->vatom[i][2] += vatom_thr[n][i][2];
dihed->vatom[i][3] += vatom_thr[n][i][3];
dihed->vatom[i][4] += vatom_thr[n][i][4];
dihed->vatom[i][5] += vatom_thr[n][i][5];
}
}
}
if (dihed->eflag_atom) {
for (int i = 0; i < ntotal; ++i) {
dihed->eatom[i] += eatom_thr[n][i];
}
}
}
}
/* ----------------------------------------------------------------------
tally eng_vdwl and virial into per thread global and per-atom accumulators
need i < nlocal test since called by bond_quartic and dihedral_charmm
------------------------------------------------------------------------- */
void ThrOMP::ev_tally_thr(Pair *pair, int i, int j, int nlocal,
int newton_pair, double evdwl, double ecoul,
double fpair, double delx, double dely,
double delz, int tid)
{
double evdwlhalf,ecoulhalf,epairhalf,v[6];
if (pair->eflag_either) {
if (pair->eflag_global) {
if (newton_pair) {
eng_vdwl_thr[tid] += evdwl;
eng_coul_thr[tid] += ecoul;
} else {
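// without newton_pair, contributions involving ghost atoms are also
// tallied on the rank that owns them, so each local atom of the pair
// gets half the pair energy to avoid double counting across MPI ranks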
evdwlhalf = 0.5*evdwl;
ecoulhalf = 0.5*ecoul;
if (i < nlocal) {
eng_vdwl_thr[tid] += evdwlhalf;
eng_coul_thr[tid] += ecoulhalf;
}
if (j < nlocal) {
eng_vdwl_thr[tid] += evdwlhalf;
eng_coul_thr[tid] += ecoulhalf;
}
}
}
if (pair->eflag_atom) {
epairhalf = 0.5 * (evdwl + ecoul);
if (newton_pair || i < nlocal) eatom_thr[tid][i] += epairhalf;
if (newton_pair || j < nlocal) eatom_thr[tid][j] += epairhalf;
}
}
if (pair->vflag_either) {
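// per-pair virial contribution: outer product of the separation
// vector with the pair force, i.e. v = del (x) (fpair*del)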
v[0] = delx*delx*fpair;
v[1] = dely*dely*fpair;
v[2] = delz*delz*fpair;
v[3] = delx*dely*fpair;
v[4] = delx*delz*fpair;
v[5] = dely*delz*fpair;
if (pair->vflag_global) {
if (newton_pair) {
virial_thr[tid][0] += v[0];
virial_thr[tid][1] += v[1];
virial_thr[tid][2] += v[2];
virial_thr[tid][3] += v[3];
virial_thr[tid][4] += v[4];
virial_thr[tid][5] += v[5];
} else {
if (i < nlocal) {
virial_thr[tid][0] += 0.5*v[0];
virial_thr[tid][1] += 0.5*v[1];
virial_thr[tid][2] += 0.5*v[2];
virial_thr[tid][3] += 0.5*v[3];
virial_thr[tid][4] += 0.5*v[4];
virial_thr[tid][5] += 0.5*v[5];
}
if (j < nlocal) {
virial_thr[tid][0] += 0.5*v[0];
virial_thr[tid][1] += 0.5*v[1];
virial_thr[tid][2] += 0.5*v[2];
virial_thr[tid][3] += 0.5*v[3];
virial_thr[tid][4] += 0.5*v[4];
virial_thr[tid][5] += 0.5*v[5];
}
}
}
if (pair->vflag_atom) {
if (newton_pair || i < nlocal) {
vatom_thr[tid][i][0] += 0.5*v[0];
vatom_thr[tid][i][1] += 0.5*v[1];
vatom_thr[tid][i][2] += 0.5*v[2];
vatom_thr[tid][i][3] += 0.5*v[3];
vatom_thr[tid][i][4] += 0.5*v[4];
vatom_thr[tid][i][5] += 0.5*v[5];
}
if (newton_pair || j < nlocal) {
vatom_thr[tid][j][0] += 0.5*v[0];
vatom_thr[tid][j][1] += 0.5*v[1];
vatom_thr[tid][j][2] += 0.5*v[2];
vatom_thr[tid][j][3] += 0.5*v[3];
vatom_thr[tid][j][4] += 0.5*v[4];
vatom_thr[tid][j][5] += 0.5*v[5];
}
}
}
}
/* ----------------------------------------------------------------------
reduce the per thread accumulated E/V data into the canonical accumulators.
------------------------------------------------------------------------- */
void ThrOMP::ev_reduce_thr(Pair *pair)
{
const int nthreads = lmp->comm->nthreads;
const int ntotal = (lmp->force->newton) ?
(lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;
for (int n = 0; n < nthreads; ++n) {
pair->eng_vdwl += eng_vdwl_thr[n];
pair->eng_coul += eng_coul_thr[n];
if (pair->vflag_either) {
pair->virial[0] += virial_thr[n][0];
pair->virial[1] += virial_thr[n][1];
pair->virial[2] += virial_thr[n][2];
pair->virial[3] += virial_thr[n][3];
pair->virial[4] += virial_thr[n][4];
pair->virial[5] += virial_thr[n][5];
if (pair->vflag_atom) {
for (int i = 0; i < ntotal; ++i) {
pair->vatom[i][0] += vatom_thr[n][i][0];
pair->vatom[i][1] += vatom_thr[n][i][1];
pair->vatom[i][2] += vatom_thr[n][i][2];
pair->vatom[i][3] += vatom_thr[n][i][3];
pair->vatom[i][4] += vatom_thr[n][i][4];
pair->vatom[i][5] += vatom_thr[n][i][5];
}
}
}
if (pair->eflag_atom) {
for (int i = 0; i < ntotal; ++i) {
pair->eatom[i] += eatom_thr[n][i];
}
}
}
}
/* ---------------------------------------------------------------------- */
// set loop range, thread id, and force array offset for threaded runs.
double **ThrOMP::loop_setup_thr(double **f, int &ifrom, int &ito, int &tid,
int inum, int nall, int nthreads)
{
#if defined(_OPENMP)
if (nthreads > 1) {
tid = omp_get_thread_num();
// each thread works on a fixed chunk of atoms.
const int idelta = 1 + inum/nthreads;
ifrom = tid*idelta;
ito = ifrom + idelta;
if (ito > inum)
ito = inum;
return f + nall*tid;
} else {
#endif
tid = 0;
ifrom = 0;
ito = inum;
return f;
#if defined(_OPENMP)
}
#endif
}
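/* ----------------------------------------------------------------------
   partitioning example (numbers illustrative only): with inum = 1000 and
   nthreads = 4, idelta = 1 + 1000/4 = 251 and the threads cover the index
   ranges [0,251), [251,502), [502,753), and [753,1000), the last chunk
   being clamped to inum. each thread also gets a private force block at
   offset nall*tid, so the force loops need no synchronization.
------------------------------------------------------------------------- */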
/* ---------------------------------------------------------------------- */
// reduce per-thread forces into the first part of the force
// array that is used for the non-threaded parts and reset
// the temporary storage to 0.0. this routine depends on the
// force arrays being stored in the order x1,y1,z1,x2,y2,z2,...
// we need to post a barrier to wait until all threads are done
// computing forces.
void ThrOMP::force_reduce_thr(double *fall, int nall,
int nthreads, int tid)
{
#if defined(_OPENMP)
// NOOP in non-threaded execution.
if (nthreads == 1) return;
#pragma omp barrier
{
double *f;
// compute this thread's chunk of atoms first, then convert the
// atom range into indices into the flattened x,y,z force array
const int idelta = 1 + nall/nthreads;
const int afrom = tid*idelta;
const int ato = ((afrom + idelta) > nall) ? nall : (afrom + idelta);
const int ifrom = 3*afrom;
const int ito = 3*ato;
for (int n = 1; n < nthreads; ++n) {
const int toffs = 3*n*nall;
f = fall + toffs;
for (int m = ifrom; m < ito; ++m) {
fall[m] += f[m];
f[m] = 0.0;
}
}
}
#else
// NOOP in non-threaded execution.
return;
#endif
}
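/* ----------------------------------------------------------------------
   layout sketch, inferred from the nall*tid and 3*n*nall offsets used
   above (an inference from this file, not an API guarantee): the force
   buffer holds nthreads consecutive blocks of nall atoms each,

     [ thread 0: 3*nall doubles | thread 1: 3*nall doubles | ... ]

   force_reduce_thr() adds blocks 1..nthreads-1 back onto block 0 and
   zeroes them, with each thread reducing only its own slice of atoms.
------------------------------------------------------------------------- */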
/* ---------------------------------------------------------------------- */
double ThrOMP::memory_usage_thr()
{
const int nthreads=lmp->comm->nthreads;
double bytes = nthreads * (3 + 7) * sizeof(double);
bytes += nthreads * maxeatom_thr * sizeof(double);
bytes += nthreads * maxvatom_thr * 6 * sizeof(double);
return bytes;
}