mirror of https://github.com/lammps/lammps.git
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6816 f3b2605a-c512-4ea7-a41b-209d697bcdaa
parent 62126a8f6d
commit 1d743442b3
src/Makefile
@@ -18,7 +18,7 @@ PACKAGE = asphere class2 colloid dipole gpu granular \
 	  shock srd xtc
 
 PACKUSER = user-misc user-atc user-awpmd user-cg-cmm \
-	   user-cuda user-eff user-ewaldn user-reaxc
+	   user-cuda user-eff user-ewaldn user-omp user-reaxc
 
 PACKALL = $(PACKAGE) $(PACKUSER)
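The new user-omp entry makes the package visible to the standard package targets generated by src/Makefile. A minimal sketch of the expected workflow, assuming the usual LAMMPS make yes-<package>/no-<package> conventions; the machine target name used below is only an example and not part of this commit:

# enable the USER-OMP package; this runs the package's Install.sh with
# argument 1, which copies the package sources into src/
cd src
make yes-user-omp
make package-status    # list which packages are currently installed

# build with a machine makefile of your choice (name is an example)
make openmpi

# disable the package again; runs Install.sh with argument 0
make no-user-omp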
src/USER-OMP/Install.sh (new file)
@@ -0,0 +1,32 @@
# Install/unInstall package files in LAMMPS
# do not install child files if parent does not exist

if (test $1 = 1) then

#  if (test -e ../pair_lj_cut_coul_long.cpp) then
#    cp pair_lj_cut_coul_long_omp.cpp ..
#    cp pair_lj_cut_coul_long_omp.h ..
#  fi

  cp pair_lj_cut_omp.cpp ..

  cp thr_omp.cpp ..

  cp pair_lj_cut_omp.h ..

  cp thr_omp.h ..

elif (test $1 = 0) then

#  rm -f ../pair_lj_cut_coul_long_omp.cpp
  rm -f ../pair_lj_cut_omp.cpp

  rm -f ../thr_omp.cpp

#  rm -f ../pair_lj_cut_coul_long_omp.h
  rm -f ../pair_lj_cut_omp.h

  rm -f ../thr_omp.h

fi
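Install.sh follows the usual LAMMPS package convention: it is called with a single argument, 1 to copy the package files into src/ and 0 to remove them again. A small sketch of a direct invocation from inside the package directory (the src/USER-OMP location is an assumption based on the user-omp entry above; the diff itself does not show file paths):

# from src/USER-OMP: install the package files into ../ (i.e. src/)
sh Install.sh 1
ls ../pair_lj_cut_omp.cpp ../thr_omp.cpp ../pair_lj_cut_omp.h ../thr_omp.h

# remove them again
sh Install.sh 0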
src/USER-OMP/Package.sh (new file)
@@ -0,0 +1,21 @@
#!/bin/sh
# Update package files in LAMMPS
# copy package file to src if it doesn't exist or is different
# do not copy OpenMP style files if the non-OMP version does not exist
for file in *_omp.cpp *_omp.h; do
  # let us see if the "rain man" can count the toothpicks...
  ofile=`echo $file | sed -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.\\\\\\(h\\\\\\|cpp\\\\\\),\\\\1.\\\\2,`
  if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") then
    : # always handle these support files
  elif (test ! -e ../$ofile) then
    continue
  fi
  if (test ! -e ../$file) then
    echo " creating src/$file"
    cp $file ..
  elif ! cmp -s $file ../$file ; then
    echo " updating src/$file"
    cp $file ..
  fi
done
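The heavily escaped sed expression above simply strips the _omp suffix from a style file name so the script can test whether the corresponding plain (non-OMP) source exists in src/. A sketch of the intended mapping, using a plainly escaped equivalent of the same substitution (the escaping differs here only because this version is not embedded in backquotes; like the original, it relies on GNU sed's \| alternation):

# pair_lj_cut_omp.cpp -> pair_lj_cut.cpp, thr_omp.h -> thr.h, etc.
for file in pair_lj_cut_omp.cpp pair_lj_cut_omp.h thr_omp.h; do
  ofile=$(echo "$file" | sed -e 's,\(.*\)_omp\.\(h\|cpp\),\1.\2,')
  echo "$file -> $ofile"
done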
src/USER-OMP/pair_lj_cut_omp.cpp (new file)
@@ -0,0 +1,163 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   This software is distributed under the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#include "math.h"
#include "pair_lj_cut_omp.h"
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "memory.h"
#include "error.h"

using namespace LAMMPS_NS;

/* ---------------------------------------------------------------------- */

PairLJCutOMP::PairLJCutOMP(LAMMPS *lmp) :
  PairLJCut(lmp), ThrOMP(lmp, PAIR)
{
  respa_enable = 0;
}

/* ---------------------------------------------------------------------- */

void PairLJCutOMP::compute(int eflag, int vflag)
{
  if (eflag || vflag) {
    ev_setup(eflag,vflag);
    ev_setup_thr(this);
  } else evflag = vflag_fdotr = 0;

  const int nall = atom->nlocal + atom->nghost;
  const int nthreads = comm->nthreads;
  const int inum = list->inum;

#if defined(_OPENMP)
#pragma omp parallel default(shared)
#endif
  {
    int ifrom, ito, tid;
    double **f;

    f = loop_setup_thr(atom->f, ifrom, ito, tid, inum, nall, nthreads);

    if (evflag) {
      if (eflag) {
        if (force->newton_pair) eval<1,1,1>(f, ifrom, ito, tid);
        else eval<1,1,0>(f, ifrom, ito, tid);
      } else {
        if (force->newton_pair) eval<1,0,1>(f, ifrom, ito, tid);
        else eval<1,0,0>(f, ifrom, ito, tid);
      }
    } else {
      if (force->newton_pair) eval<0,0,1>(f, ifrom, ito, tid);
      else eval<0,0,0>(f, ifrom, ito, tid);
    }

    // reduce per thread forces into global force array.
    force_reduce_thr(&(atom->f[0][0]), nall, nthreads, tid);
  } // end of omp parallel region

  // reduce per thread energy and virial, if requested.
  if (evflag) ev_reduce_thr(this);
  if (vflag_fdotr) virial_fdotr_compute();
}

template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
void PairLJCutOMP::eval(double **f, int iifrom, int iito, int tid)
{
  int i,j,ii,jj,jnum,itype,jtype;
  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
  double rsq,r2inv,r6inv,forcelj,factor_lj;
  int *ilist,*jlist,*numneigh,**firstneigh;

  evdwl = 0.0;

  double **x = atom->x;
  int *type = atom->type;
  int nlocal = atom->nlocal;
  double *special_lj = force->special_lj;
  double fxtmp,fytmp,fztmp;

  ilist = list->ilist;
  numneigh = list->numneigh;
  firstneigh = list->firstneigh;

  // loop over neighbors of my atoms

  for (ii = iifrom; ii < iito; ++ii) {

    i = ilist[ii];
    xtmp = x[i][0];
    ytmp = x[i][1];
    ztmp = x[i][2];
    itype = type[i];
    jlist = firstneigh[i];
    jnum = numneigh[i];
    fxtmp=fytmp=fztmp=0.0;

    for (jj = 0; jj < jnum; jj++) {
      j = jlist[jj];
      factor_lj = special_lj[sbmask(j)];
      j &= NEIGHMASK;

      delx = xtmp - x[j][0];
      dely = ytmp - x[j][1];
      delz = ztmp - x[j][2];
      rsq = delx*delx + dely*dely + delz*delz;
      jtype = type[j];

      if (rsq < cutsq[itype][jtype]) {
        r2inv = 1.0/rsq;
        r6inv = r2inv*r2inv*r2inv;
        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
        fpair = factor_lj*forcelj*r2inv;

        fxtmp += delx*fpair;
        fytmp += dely*fpair;
        fztmp += delz*fpair;
        if (NEWTON_PAIR || j < nlocal) {
          f[j][0] -= delx*fpair;
          f[j][1] -= dely*fpair;
          f[j][2] -= delz*fpair;
        }

        if (EFLAG) {
          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype])
            - offset[itype][jtype];
          evdwl *= factor_lj;
        }

        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
                                 evdwl,0.0,fpair,delx,dely,delz,tid);
      }
    }
    f[i][0] += fxtmp;
    f[i][1] += fytmp;
    f[i][2] += fztmp;
  }
}

/* ---------------------------------------------------------------------- */

double PairLJCutOMP::memory_usage()
{
  double bytes = memory_usage_thr();
  bytes += PairLJCut::memory_usage();

  return bytes;
}
src/USER-OMP/pair_lj_cut_omp.h (new file)
@@ -0,0 +1,48 @@
/* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#ifdef PAIR_CLASS

PairStyle(lj/cut/omp,PairLJCutOMP)

#else

#ifndef LMP_PAIR_LJ_CUT_OMP_H
#define LMP_PAIR_LJ_CUT_OMP_H

#include "pair_lj_cut.h"
#include "thr_omp.h"

namespace LAMMPS_NS {

class PairLJCutOMP : public PairLJCut, public ThrOMP {

 public:
  PairLJCutOMP(class LAMMPS *);

  virtual void compute(int, int);
  virtual double memory_usage();

 private:
  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
  void eval(double **f, int ifrom, int ito, int tid);
};

}

#endif
#endif
src/USER-OMP/thr_omp.cpp (new file)
@@ -0,0 +1,392 @@
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   OpenMP based threading support for LAMMPS
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#include "thr_omp.h"

#include "memory.h"

#include "atom.h"
#include "comm.h"
#include "force.h"

#include "pair.h"
#include "dihedral.h"

#if defined(_OPENMP)
#include <omp.h>
#endif

using namespace LAMMPS_NS;

/* ---------------------------------------------------------------------- */

ThrOMP::ThrOMP(LAMMPS *ptr, int style) : thr_style(style), lmp(ptr)
{
  // initialize fixed size per thread storage
  eng_vdwl_thr = eng_coul_thr = eng_bond_thr = NULL;
  virial_thr = NULL;
  lmp->memory->create(eng_vdwl_thr,lmp->comm->nthreads,"thr_omp:eng_vdwl_thr");
  lmp->memory->create(eng_coul_thr,lmp->comm->nthreads,"thr_omp:eng_coul_thr");
  lmp->memory->create(eng_bond_thr,lmp->comm->nthreads,"thr_omp:eng_bond_thr");
  lmp->memory->create(virial_thr,lmp->comm->nthreads,6,"thr_omp:virial_thr");

  // variable size per thread, per atom storage
  // the actual allocation happens via memory->grow() in ev_setup_thr()
  maxeatom_thr = maxvatom_thr = 0;
  eatom_thr = NULL;
  vatom_thr = NULL;
}

/* ---------------------------------------------------------------------- */

ThrOMP::~ThrOMP()
{
  lmp->memory->destroy(eng_vdwl_thr);
  lmp->memory->destroy(eng_coul_thr);
  lmp->memory->destroy(eng_bond_thr);
  lmp->memory->destroy(virial_thr);
  lmp->memory->destroy(eatom_thr);
  lmp->memory->destroy(vatom_thr);
}

/* ---------------------------------------------------------------------- */

void ThrOMP::ev_zero_acc_thr(int ntotal, int eflag_global, int vflag_global,
                             int eflag_atom, int vflag_atom, int nthreads)
{
  int t,i;

  for (t = 0; t < nthreads; ++t) {

    if (eflag_global)
      eng_vdwl_thr[t] = eng_coul_thr[t] = eng_bond_thr[t] = 0.0;

    if (vflag_global)
      for (i = 0; i < 6; ++i)
        virial_thr[t][i] = 0.0;

    if (eflag_atom)
      for (i = 0; i < ntotal; ++i)
        eatom_thr[t][i] = 0.0;

    if (vflag_atom)
      for (i = 0; i < ntotal; ++i) {
        vatom_thr[t][i][0] = 0.0;
        vatom_thr[t][i][1] = 0.0;
        vatom_thr[t][i][2] = 0.0;
        vatom_thr[t][i][3] = 0.0;
        vatom_thr[t][i][4] = 0.0;
        vatom_thr[t][i][5] = 0.0;
      }
  }
}

/* ---------------------------------------------------------------------- */

void ThrOMP::ev_setup_thr(Dihedral *dihed)
{
  int nthreads = lmp->comm->nthreads;

  // reallocate per-atom arrays if necessary
  if (dihed->eflag_atom && lmp->atom->nmax > maxeatom_thr) {
    maxeatom_thr = lmp->atom->nmax;
    lmp->memory->grow(eatom_thr,nthreads,maxeatom_thr,"thr_omp:eatom_thr");
  }
  if (dihed->vflag_atom && lmp->atom->nmax > maxvatom_thr) {
    maxvatom_thr = lmp->atom->nmax;
    lmp->memory->grow(vatom_thr,nthreads,maxvatom_thr,6,"thr_omp:vatom_thr");
  }

  int ntotal = (lmp->force->newton_bond) ?
    (lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;

  // zero per thread accumulators
  ev_zero_acc_thr(ntotal, dihed->eflag_global, dihed->vflag_global,
                  dihed->eflag_atom, dihed->vflag_atom, nthreads);
}

/* ---------------------------------------------------------------------- */

void ThrOMP::ev_setup_thr(Pair *pair)
{
  int nthreads = lmp->comm->nthreads;

  // reallocate per-atom arrays if necessary
  if (pair->eflag_atom && lmp->atom->nmax > maxeatom_thr) {
    maxeatom_thr = lmp->atom->nmax;
    lmp->memory->grow(eatom_thr,nthreads,maxeatom_thr,"thr_omp:eatom_thr");
  }
  if (pair->vflag_atom && lmp->atom->nmax > maxvatom_thr) {
    maxvatom_thr = lmp->atom->nmax;
    lmp->memory->grow(vatom_thr,nthreads,maxvatom_thr,6,"thr_omp:vatom_thr");
  }

  int ntotal = (lmp->force->newton) ?
    (lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;

  // zero per thread accumulators
  ev_zero_acc_thr(ntotal, pair->eflag_global, pair->vflag_global,
                  pair->eflag_atom, pair->vflag_atom, nthreads);
}

/* ----------------------------------------------------------------------
   reduce the per thread accumulated E/V data into the canonical accumulators.
------------------------------------------------------------------------- */
void ThrOMP::ev_reduce_thr(Dihedral *dihed)
{
  int nthreads = lmp->comm->nthreads;
  int ntotal = (lmp->force->newton_bond) ?
    (lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;

  for (int n = 0; n < nthreads; ++n) {
    dihed->energy += eng_bond_thr[n];
    if (dihed->vflag_either) {
      dihed->virial[0] += virial_thr[n][0];
      dihed->virial[1] += virial_thr[n][1];
      dihed->virial[2] += virial_thr[n][2];
      dihed->virial[3] += virial_thr[n][3];
      dihed->virial[4] += virial_thr[n][4];
      dihed->virial[5] += virial_thr[n][5];
      if (dihed->vflag_atom) {
        for (int i = 0; i < ntotal; ++i) {
          dihed->vatom[i][0] += vatom_thr[n][i][0];
          dihed->vatom[i][1] += vatom_thr[n][i][1];
          dihed->vatom[i][2] += vatom_thr[n][i][2];
          dihed->vatom[i][3] += vatom_thr[n][i][3];
          dihed->vatom[i][4] += vatom_thr[n][i][4];
          dihed->vatom[i][5] += vatom_thr[n][i][5];
        }
      }
    }
    if (dihed->eflag_atom) {
      for (int i = 0; i < ntotal; ++i) {
        dihed->eatom[i] += eatom_thr[n][i];
      }
    }
  }
}

/* ----------------------------------------------------------------------
   tally eng_vdwl and virial into per thread global and per-atom accumulators
   need i < nlocal test since called by bond_quartic and dihedral_charmm
------------------------------------------------------------------------- */

void ThrOMP::ev_tally_thr(Pair *pair, int i, int j, int nlocal,
                          int newton_pair, double evdwl, double ecoul,
                          double fpair, double delx, double dely,
                          double delz, int tid)
{
  double evdwlhalf,ecoulhalf,epairhalf,v[6];

  if (pair->eflag_either) {
    if (pair->eflag_global) {
      if (newton_pair) {
        eng_vdwl_thr[tid] += evdwl;
        eng_coul_thr[tid] += ecoul;
      } else {
        evdwlhalf = 0.5*evdwl;
        ecoulhalf = 0.5*ecoul;
        if (i < nlocal) {
          eng_vdwl_thr[tid] += evdwlhalf;
          eng_coul_thr[tid] += ecoulhalf;
        }
        if (j < nlocal) {
          eng_vdwl_thr[tid] += evdwlhalf;
          eng_coul_thr[tid] += ecoulhalf;
        }
      }
    }
    if (pair->eflag_atom) {
      epairhalf = 0.5 * (evdwl + ecoul);
      if (newton_pair || i < nlocal) eatom_thr[tid][i] += epairhalf;
      if (newton_pair || j < nlocal) eatom_thr[tid][j] += epairhalf;
    }
  }

  if (pair->vflag_either) {
    v[0] = delx*delx*fpair;
    v[1] = dely*dely*fpair;
    v[2] = delz*delz*fpair;
    v[3] = delx*dely*fpair;
    v[4] = delx*delz*fpair;
    v[5] = dely*delz*fpair;

    if (pair->vflag_global) {
      if (newton_pair) {
        virial_thr[tid][0] += v[0];
        virial_thr[tid][1] += v[1];
        virial_thr[tid][2] += v[2];
        virial_thr[tid][3] += v[3];
        virial_thr[tid][4] += v[4];
        virial_thr[tid][5] += v[5];
      } else {
        if (i < nlocal) {
          virial_thr[tid][0] += 0.5*v[0];
          virial_thr[tid][1] += 0.5*v[1];
          virial_thr[tid][2] += 0.5*v[2];
          virial_thr[tid][3] += 0.5*v[3];
          virial_thr[tid][4] += 0.5*v[4];
          virial_thr[tid][5] += 0.5*v[5];
        }
        if (j < nlocal) {
          virial_thr[tid][0] += 0.5*v[0];
          virial_thr[tid][1] += 0.5*v[1];
          virial_thr[tid][2] += 0.5*v[2];
          virial_thr[tid][3] += 0.5*v[3];
          virial_thr[tid][4] += 0.5*v[4];
          virial_thr[tid][5] += 0.5*v[5];
        }
      }
    }

    if (pair->vflag_atom) {
      if (newton_pair || i < nlocal) {
        vatom_thr[tid][i][0] += 0.5*v[0];
        vatom_thr[tid][i][1] += 0.5*v[1];
        vatom_thr[tid][i][2] += 0.5*v[2];
        vatom_thr[tid][i][3] += 0.5*v[3];
        vatom_thr[tid][i][4] += 0.5*v[4];
        vatom_thr[tid][i][5] += 0.5*v[5];
      }
      if (newton_pair || j < nlocal) {
        vatom_thr[tid][j][0] += 0.5*v[0];
        vatom_thr[tid][j][1] += 0.5*v[1];
        vatom_thr[tid][j][2] += 0.5*v[2];
        vatom_thr[tid][j][3] += 0.5*v[3];
        vatom_thr[tid][j][4] += 0.5*v[4];
        vatom_thr[tid][j][5] += 0.5*v[5];
      }
    }
  }
}

/* ----------------------------------------------------------------------
   reduce the per thread accumulated E/V data into the canonical accumulators.
------------------------------------------------------------------------- */
void ThrOMP::ev_reduce_thr(Pair *pair)
{
  const int nthreads = lmp->comm->nthreads;
  const int ntotal = (lmp->force->newton) ?
    (lmp->atom->nlocal + lmp->atom->nghost) : lmp->atom->nlocal;

  for (int n = 0; n < nthreads; ++n) {
    pair->eng_vdwl += eng_vdwl_thr[n];
    pair->eng_coul += eng_coul_thr[n];
    if (pair->vflag_either) {
      pair->virial[0] += virial_thr[n][0];
      pair->virial[1] += virial_thr[n][1];
      pair->virial[2] += virial_thr[n][2];
      pair->virial[3] += virial_thr[n][3];
      pair->virial[4] += virial_thr[n][4];
      pair->virial[5] += virial_thr[n][5];
      if (pair->vflag_atom) {
        for (int i = 0; i < ntotal; ++i) {
          pair->vatom[i][0] += vatom_thr[n][i][0];
          pair->vatom[i][1] += vatom_thr[n][i][1];
          pair->vatom[i][2] += vatom_thr[n][i][2];
          pair->vatom[i][3] += vatom_thr[n][i][3];
          pair->vatom[i][4] += vatom_thr[n][i][4];
          pair->vatom[i][5] += vatom_thr[n][i][5];
        }
      }
    }
    if (pair->eflag_atom) {
      for (int i = 0; i < ntotal; ++i) {
        pair->eatom[i] += eatom_thr[n][i];
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

// set loop range, thread id, and force array offset for threaded runs.
double **ThrOMP::loop_setup_thr(double **f, int &ifrom, int &ito, int &tid,
                                int inum, int nall, int nthreads)
{
#if defined(_OPENMP)
  if (nthreads > 1) {
    tid = omp_get_thread_num();

    // each thread works on a fixed chunk of atoms.
    const int idelta = 1 + inum/nthreads;
    ifrom = tid*idelta;
    ito   = ifrom + idelta;
    if (ito > inum)
      ito = inum;

    return f + nall*tid;

  } else {
#endif
    tid = 0;
    ifrom = 0;
    ito = inum;
    return f;
#if defined(_OPENMP)
  }
#endif
}

/* ---------------------------------------------------------------------- */

// reduce per thread forces into the first part of the force
// array that is used for the non-threaded parts and reset
// the temporary storage to 0.0. this routine depends on the
// force arrays being stored in the order x1,y1,z1,x2,y2,z2,...
// we need to post a barrier to wait until all threads are done
// with computing forces.
void ThrOMP::force_reduce_thr(double *fall, int nall,
                              int nthreads, int tid)
{
#if defined(_OPENMP)
  // NOOP in non-threaded execution.
  if (nthreads == 1) return;
#pragma omp barrier
  {
    double *f;
    const int idelta = 1 + nall/nthreads;
    const int ifrom = 3*tid*idelta;
    const int ito   = 3*(((tid+1)*idelta > nall) ? nall : ((tid+1)*idelta));

    for (int n = 1; n < nthreads; ++n) {
      const int toffs = 3*n*nall;
      f = fall + toffs;
      for (int m = ifrom; m < ito; ++m) {
        fall[m] += f[m];
        f[m] = 0.0;
      }
    }
  }
#else
  // NOOP in non-threaded execution.
  return;
#endif
}

/* ---------------------------------------------------------------------- */

double ThrOMP::memory_usage_thr()
{
  const int nthreads=lmp->comm->nthreads;

  double bytes = nthreads * (3 + 7) * sizeof(double);
  bytes += nthreads * maxeatom_thr * sizeof(double);
  bytes += nthreads * maxvatom_thr * 6 * sizeof(double);
  return bytes;
}
src/USER-OMP/thr_omp.h (new file)
@@ -0,0 +1,79 @@
/* -*- c++ -*- -------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#ifndef LMP_THR_OMP_H
#define LMP_THR_OMP_H

#include "pointers.h"

namespace LAMMPS_NS {

// forward declarations
class Pair;
class Dihedral;

class ThrOMP {

 protected:
  const int thr_style;
  enum {PAIR=1, BOND, ANGLE, DIHEDRAL, IMPROPER, KSPACE, FIX, COMPUTE};

  LAMMPS *lmp;           // reference to base lammps object.

  double *eng_vdwl_thr;  // per thread accumulated vdw energy
  double *eng_coul_thr;  // per thread accumulated coulomb energies
  double *eng_bond_thr;  // per thread accumulated bonded energy

  double **virial_thr;   // per thread virial
  double **eatom_thr;    // per thread per atom energy
  double ***vatom_thr;   // per thread per atom virial

  int maxeatom_thr, maxvatom_thr;

 public:
  ThrOMP(LAMMPS *, int);
  virtual ~ThrOMP();

  double memory_usage_thr();

 protected:
  // extra ev_tally work for threaded styles
  void ev_setup_thr(Pair *);
  void ev_setup_thr(Dihedral *);

  void ev_reduce_thr(Pair *);
  void ev_reduce_thr(Dihedral *);

 private:
  // internal method to be used by multiple ev_setup_thr() methods
  void ev_zero_acc_thr(int, int, int, int, int, int);

 protected:
  // threading adapted versions of the ev_tally infrastructure
  void ev_tally_thr(Pair *, int, int, int, int, double, double,
                    double, double, double, double, int);

 protected:
  // set loop range, thread id, and force array offset for threaded runs.
  double **loop_setup_thr(double **, int &, int &, int &, int, int, int);

  // reduce per thread forces into the first part of the force array
  void force_reduce_thr(double *, int, int, int);
};

}
#endif
src/dihedral.h
@@ -20,6 +20,8 @@
 namespace LAMMPS_NS {
 
 class Dihedral : protected Pointers {
+  friend class ThrOMP;
+
  public:
   int allocated;
   int *setflag;
src/pair.h
@@ -22,6 +22,7 @@ class Pair : protected Pointers {
   friend class BondQuartic;
   friend class DihedralCharmm;
   friend class FixGPU;
+  friend class ThrOMP;
 
  public:
   double eng_vdwl,eng_coul;            // accumulated energies