git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9404 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2013-02-08 16:57:12 +00:00 · 2013-02-08 16:57:12 +00:00 · 6676724eee
parent 9ce4e5b121
commit 6676724eee
32 changed files with 3100 additions and 114 deletions
--- a/src/USER-OMP/Install.sh
+++ b/src/USER-OMP/Install.sh
@ -1,11 +1,10 @@
 # Install/unInstall package files in LAMMPS
+
+# step 1: process all *_omp.cpp and *_omp.h files.
 # do not install child files if parent does not exist

 for file in *_omp.cpp *_omp.h ; do
-    # let us see if the "rain man" can count the toothpicks...
-   ofile=`echo $file | sed  \
-   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.h,\\\\1.h, \
-   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.cpp,\\\\1.cpp,`
+  # derive name of non-OpenMP version (foo_omp.h -> foo.h, foo_omp.cpp -> foo.cpp)
+  ofile=`echo $file | sed  -e 's,\(.*\)_omp\.\(h\|cpp\),\1.\2,'`
  if (test $1 = 1) then
    if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") then
      :  # always install those files.
@ -20,6 +19,7 @@ for file in *_omp.cpp *_omp.h ; do
  fi
 done

+# step 2: handle cases and tasks not handled in step 1.
 if (test $1 = 1) then

  if (test -e ../Makefile.package) then
--- a/src/USER-OMP/Package.sh
+++ b/src/USER-OMP/Package.sh
@ -1,23 +1,16 @@
 # Update package files in LAMMPS
-# copy package file to src if it doesn't exists or is different
-# do not copy OpenMP style files, if a non-OpenMP version does 
-# not exist. Do remove OpenMP style files that have no matching
-# non-OpenMP version installed, e.g. after a package has been
-# removed
-for file in *_omp.cpp *_omp.h thr_data.h thr_data.cpp; do
-  # let us see if the "rain man" can count the toothpicks...
-   ofile=`echo $file | sed  \
-   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.\\\\\\(h\\\\\\|cpp\\\\\\),\\\\1.\\\\2,`
-  if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") \
-      || (test $file = "thr_data.h") || (test $file = "thr_data.cpp") then
-    if (test ! -e ../$file) then
-      echo "  creating src/$file"
-      cp $file ..
-    elif ! cmp -s $file ../$file ; then
-      echo "  updating src/$file"
-      cp $file ..
-    fi
-  elif (test ! -e ../$ofile) then
+# Copy package file to src if it doesn't exist or is different.
+# But only copy the file, if a non-OpenMP version exists and
+# remove OpenMP versions that have no matching serial file
+# installed, e.g. after a package has been removed.
+for file in *_omp.cpp *_omp.h ; do
+  # these are special cases and handled below
+  if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") then
+    continue
+  fi
+  # derive name of non-OpenMP version
+  ofile=`echo $file | sed   -e 's,\(.*\)_omp\.\(h\|cpp\),\1.\2,'`
+  if (test ! -e ../$ofile) then
    if (test -e ../$file) then
      echo "  removing src/$file"
      rm -f ../$file
@ -33,7 +26,8 @@ for file in *_omp.cpp *_omp.h thr_data.h thr_data.cpp; do
  fi
 done

-for file in thr_data.h thr_data.cpp; do
+# special case for files not covered by the automatic script above
+for file in thr_data.h thr_data.cpp thr_omp.h thr_omp.cpp; do
  if (test ! -e ../$file) then
    echo "  creating src/$file"
    cp $file ..
--- a/src/USER-OMP/angle_cosine_periodic_omp.cpp
+++ b/src/USER-OMP/angle_cosine_periodic_omp.cpp
@ -23,12 +23,14 @@
 #include "domain.h"

 #include "math_const.h"
+#include "math_special.h"

 #include <math.h>

 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

 #define SMALL 0.001

@ -161,8 +163,8 @@ void AngleCosinePeriodicOMP::eval(int nfrom, int nto, ThrData * const thr)
      un_2 = un_1;
      un_1 = un;
    }
-    tn = b_factor*pow(-1.0,(double)m)*tn;
-    un = b_factor*pow(-1.0,(double)m)*m*un;
+    tn = b_factor*powsign(m)*tn;
+    un = b_factor*powsign(m)*m*un;

    if (EFLAG) eangle = 2*k[type]*(1.0 - tn);

--- a/src/USER-OMP/angle_fourier_omp.cpp
+++ b/src/USER-OMP/angle_fourier_omp.cpp
@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_fourier_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleFourierOMP::AngleFourierOMP(class LAMMPS *lmp)
+  : AngleFourier(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  // mark this style as carrying the OMP suffix
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleFourierOMP::compute(int eflag, int vflag)
+{
+  // prepare energy/virial tallying, or switch it off entirely
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else {
+    evflag = 0;
+  }
+
+  // these stay const locals: they are referenced inside the
+  // default(none) parallel region below
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    // determine this thread's share [ifrom,ito) of the angle list
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_BOND> instance matching the flags
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the angles with list indices nfrom <= n < nto and add the
+   resulting forces to the array returned by thr->get_f().
+   energy: E = k*(C0 + C1*cos(theta) + C2*cos(2*theta))
+   template flags: EVFLAG = call ev_tally_thr(), EFLAG = compute the
+   angle energy, NEWTON_BOND = apply forces to atoms with index >= nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleFourierOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double term;
+  double rsq1,rsq2,r1,r2,c,c2,a,a11,a12,a22;
+
+  // eangle is passed to ev_tally_thr() even when EFLAG is 0,
+  // so it must never hold an indeterminate value
+  eangle = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // cosine of the angle, clamped against round-off
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // c2 = cos(2*theta) expressed through c = cos(theta)
+
+    c2 = 2.0*c*c-1.0;
+    term = k[type]*(C0[type]+C1[type]*c+C2[type]*c2);
+
+    if (EFLAG) eangle = term;
+
+    // a = dE/dc
+
+    a = k[type]*(C1[type]+4.0*C2[type]*c);
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
--- a/src/USER-OMP/angle_fourier_omp.h
+++ b/src/USER-OMP/angle_fourier_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(fourier/omp,AngleFourierOMP)
+
+#else
+
+#ifndef LMP_ANGLE_FOURIER_OMP_H
+#define LMP_ANGLE_FOURIER_OMP_H
+
+#include "angle_fourier.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded variant of the fourier angle style (AngleFourier + ThrOMP)
+class AngleFourierOMP : public AngleFourier, public ThrOMP {
+
+ public:
+  // set up the base classes and flag the style as an OMP-suffix style
+  AngleFourierOMP(class LAMMPS *lmp);
+
+  // compute forces for the angle list; arguments are the energy and
+  // virial flags
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // process angles ifrom <= n < ito with the given thread data; the
+  // template arguments are compile-time switches for tallying, energy
+  // evaluation and the newton_bond setting
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/angle_fourier_simple_omp.cpp
+++ b/src/USER-OMP/angle_fourier_simple_omp.cpp
@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_fourier_simple_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleFourierSimpleOMP::AngleFourierSimpleOMP(class LAMMPS *lmp)
+  : AngleFourierSimple(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  // mark this style as carrying the OMP suffix
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleFourierSimpleOMP::compute(int eflag, int vflag)
+{
+  // prepare energy/virial tallying, or switch it off entirely
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else {
+    evflag = 0;
+  }
+
+  // these stay const locals: they are referenced inside the
+  // default(none) parallel region below
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    // determine this thread's share [ifrom,ito) of the angle list
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_BOND> instance matching the flags
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the angles with list indices nfrom <= n < nto and add the
+   resulting forces to the array returned by thr->get_f().
+   energy: E = k*(1 + C*cos(N*theta))
+   template flags: EVFLAG = call ev_tally_thr(), EFLAG = compute the
+   angle energy, NEWTON_BOND = apply forces to atoms with index >= nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleFourierSimpleOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double term,sgn;
+  double rsq1,rsq2,r1,r2,c,cn,th,nth,a,a11,a12,a22;
+
+  // eangle is passed to ev_tally_thr() even when EFLAG is 0,
+  // so it must never hold an indeterminate value
+  eangle = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // cosine of the angle, clamped against round-off
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+
+    th = acos(c);
+    nth = N[type]*th;
+    cn = cos(nth);
+    term = k[type]*(1.0+C[type]*cn);
+
+    if (EFLAG) eangle = term;
+
+    // handle sin(n th)/sin(th) singularities
+    // c is clamped to [-1,1], so 1-|c| >= 0 is the distance from the
+    // singular points th = 0 and th = pi. use the exact ratio away from
+    // them and a series expansion only in their immediate vicinity.
+    // (the previous test, fabs(c)-1.0 > 0.0001, could never be true)
+
+    if ( 1.0-fabs(c) > 0.0001 ) {
+      a = k[type]*C[type]*N[type]*sin(nth)/sin(th);
+    } else {
+      if ( c >= 0.0 ) {
+        term = 1.0 - c;
+        sgn = 1.0;
+      } else {
+        // near th = pi the sign of the limit depends on the parity of N
+        term = 1.0 + c;
+        sgn = ( fmodf((float)(N[type]),2.0) == 0.0f )?-1.0:1.0;
+      }
+      a = N[type]+N[type]*(1.0-N[type]*N[type])*term/3.0;
+      a = k[type]*C[type]*N[type]*(double)(sgn)*a;
+    }
+
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
--- a/src/USER-OMP/angle_fourier_simple_omp.h
+++ b/src/USER-OMP/angle_fourier_simple_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(fourier/simple/omp,AngleFourierSimpleOMP)
+
+#else
+
+#ifndef LMP_ANGLE_FOURIER_SIMPLE_OMP_H
+#define LMP_ANGLE_FOURIER_SIMPLE_OMP_H
+
+#include "angle_fourier_simple.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded variant of the fourier/simple angle style
+// (AngleFourierSimple + ThrOMP)
+class AngleFourierSimpleOMP : public AngleFourierSimple, public ThrOMP {
+
+ public:
+  // set up the base classes and flag the style as an OMP-suffix style
+  AngleFourierSimpleOMP(class LAMMPS *lmp);
+
+  // compute forces for the angle list; arguments are the energy and
+  // virial flags
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // process angles ifrom <= n < ito with the given thread data; the
+  // template arguments are compile-time switches for tallying, energy
+  // evaluation and the newton_bond setting
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/angle_quartic_omp.cpp
+++ b/src/USER-OMP/angle_quartic_omp.cpp
@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_quartic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleQuarticOMP::AngleQuarticOMP(class LAMMPS *lmp)
+  : AngleQuartic(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  // mark this style as carrying the OMP suffix
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleQuarticOMP::compute(int eflag, int vflag)
+{
+  // prepare energy/virial tallying, or switch it off entirely
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else {
+    evflag = 0;
+  }
+
+  // these stay const locals: they are referenced inside the
+  // default(none) parallel region below
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    // determine this thread's share [ifrom,ito) of the angle list
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_BOND> instance matching the flags
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the angles with list indices nfrom <= n < nto and add the
+   resulting forces to the array returned by thr->get_f().
+   energy: E = k2*dtheta^2 + k3*dtheta^3 + k4*dtheta^4
+           with dtheta = theta - theta0
+   template flags: EVFLAG = call ev_tally_thr(), EFLAG = compute the
+   angle energy, NEWTON_BOND = apply forces to atoms with index >= nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleQuarticOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double dtheta,dtheta2,dtheta3,dtheta4,tk;
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22;
+
+  // eangle is passed to ev_tally_thr() even when EFLAG is 0,
+  // so it must never hold an indeterminate value
+  eangle = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // s becomes 1/sin(theta), with sin(theta) bounded below by SMALL
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // force & energy
+    // tk is the complete derivative dE/dtheta of the energy below
+
+    dtheta = acos(c) - theta0[type];
+    dtheta2 = dtheta * dtheta;
+    dtheta3 = dtheta2 * dtheta;
+    tk =  2.0 * k2[type] * dtheta + 3.0 * k3[type] * dtheta2 + 4.0 * k4[type] * dtheta3;
+
+    if (EFLAG) {
+      dtheta4 = dtheta3 * dtheta;
+      eangle = k2[type] * dtheta2 + k3[type] * dtheta3 + k4[type] * dtheta4;
+    }
+
+    // a = dE/dc = -(dE/dtheta)/sin(theta). tk already contains the
+    // factors 2, 3 and 4 from differentiating the polynomial, so no
+    // additional factor of 2 must be applied here.
+
+    a = -tk * s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
--- a/src/USER-OMP/angle_quartic_omp.h
+++ b/src/USER-OMP/angle_quartic_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(quartic/omp,AngleQuarticOMP)
+
+#else
+
+#ifndef LMP_ANGLE_QUARTIC_OMP_H
+#define LMP_ANGLE_QUARTIC_OMP_H
+
+#include "angle_quartic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded variant of the quartic angle style (AngleQuartic + ThrOMP)
+class AngleQuarticOMP : public AngleQuartic, public ThrOMP {
+
+ public:
+  // set up the base classes and flag the style as an OMP-suffix style
+  AngleQuarticOMP(class LAMMPS *lmp);
+
+  // compute forces for the angle list; arguments are the energy and
+  // virial flags
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // process angles ifrom <= n < ito with the given thread data; the
+  // template arguments are compile-time switches for tallying, energy
+  // evaluation and the newton_bond setting
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/dihedral_fourier_omp.cpp
+++ b/src/USER-OMP/dihedral_fourier_omp.cpp
@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_fourier_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "math_const.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define TOLERANCE 0.05
+
+/* ---------------------------------------------------------------------- */
+
+DihedralFourierOMP::DihedralFourierOMP(class LAMMPS *lmp)
+  : DihedralFourier(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  // mark this style as carrying the OMP suffix
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralFourierOMP::compute(int eflag, int vflag)
+{
+  // prepare energy/virial tallying, or switch it off entirely
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else {
+    evflag = 0;
+  }
+
+  // these stay const locals: they are referenced inside the
+  // default(none) parallel region below
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    // determine this thread's share [ifrom,ito) of the dihedral list
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_BOND> instance matching the flags
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the dihedrals with list indices nfrom <= n < nto and add the
+   resulting forces to the array returned by thr->get_f().
+   energy: sum over the nterms[type] terms of k_i*(1+cos(n_i*phi-d_i))
+   template flags: EVFLAG = call ev_tally_thr(), EFLAG = compute the
+   dihedral energy, NEWTON_BOND = apply forces to atoms with index >= nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralFourierOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,i,j,m,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double ax,ay,az,bx,by,bz,rasq,rbsq,rgsq,rg,rginv,ra2inv,rb2inv,rabinv;
+  double df,df1_,ddf1_,fg,hg,fga,hgb,gaa,gbb;
+  double dtfx,dtfy,dtfz,dtgx,dtgy,dtgz,dthx,dthy,dthz;
+  double c,s,p_,sx2,sy2,sz2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // a = vb1 x (-vb2), b = vb3 x (-vb2)
+
+    ax = vb1y*vb2zm - vb1z*vb2ym;
+    ay = vb1z*vb2xm - vb1x*vb2zm;
+    az = vb1x*vb2ym - vb1y*vb2xm;
+    bx = vb3y*vb2zm - vb3z*vb2ym;
+    by = vb3z*vb2xm - vb3x*vb2zm;
+    bz = vb3x*vb2ym - vb3y*vb2xm;
+
+    rasq = ax*ax + ay*ay + az*az;
+    rbsq = bx*bx + by*by + bz*bz;
+    rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+    rg = sqrt(rgsq);
+
+    // inverses default to zero for degenerate geometries
+
+    rginv = ra2inv = rb2inv = 0.0;
+    if (rg > 0) rginv = 1.0/rg;
+    if (rasq > 0) ra2inv = 1.0/rasq;
+    if (rbsq > 0) rb2inv = 1.0/rbsq;
+    rabinv = sqrt(ra2inv*rb2inv);
+
+    c = (ax*bx + ay*by + az*bz)*rabinv;
+    s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force and energy
+    // p = sum(i=1,nterms) k_i*(1+cos(n_i*phi-d_i))
+    // dp = dp / dphi
+    edihedral = 0.0;
+    df = 0.0;
+    for (j=0; j<nterms[type]; j++) {
+      m = multiplicity[type][j];
+      p_ = 1.0;
+      df1_ = 0.0;
+      // ddf1_ is only assigned inside the recurrence below; give it a
+      // defined value so that a term with m == 0 does not read an
+      // uninitialized (or stale) variable
+      ddf1_ = 0.0;
+
+      // build cos(m*phi) in p_ and sin(m*phi) in df1_ by repeated
+      // application of the angle addition formulas
+      for (i = 0; i < m; i++) {
+        ddf1_ = p_*c - df1_*s;
+        df1_ = p_*s + df1_*c;
+        p_ = ddf1_;
+      }
+
+      // apply the phase shift of this term
+      p_ = p_*cos_shift[type][j] + df1_*sin_shift[type][j];
+      df1_ = df1_*cos_shift[type][j] - ddf1_*sin_shift[type][j];
+      df1_ *= -m;
+      p_ += 1.0;
+
+      // a term with multiplicity zero is a constant: no force contribution
+      if (m == 0) {
+        p_ = 1.0 + cos_shift[type][j];
+        df1_ = 0.0;
+      }
+
+      if (EFLAG) edihedral += k[type][j] * p_;
+
+      df += (-k[type][j] * df1_);
+    }
+
+    fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+    hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+    fga = fg*ra2inv*rginv;
+    hgb = hg*rb2inv*rginv;
+    gaa = -ra2inv*rg;
+    gbb = rb2inv*rg;
+
+    dtfx = gaa*ax;
+    dtfy = gaa*ay;
+    dtfz = gaa*az;
+    dtgx = fga*ax - hgb*bx;
+    dtgy = fga*ay - hgb*by;
+    dtgz = fga*az - hgb*bz;
+    dthx = gbb*bx;
+    dthy = gbb*by;
+    dthz = gbb*bz;
+
+    sx2 = df*dtgx;
+    sy2 = df*dtgy;
+    sz2 = df*dtgz;
+
+    f1[0] = df*dtfx;
+    f1[1] = df*dtfy;
+    f1[2] = df*dtfz;
+
+    f2[0] = sx2 - f1[0];
+    f2[1] = sy2 - f1[1];
+    f2[2] = sz2 - f1[2];
+
+    f4[0] = df*dthx;
+    f4[1] = df*dthy;
+    f4[2] = df*dthz;
+
+    f3[0] = -sx2 - f4[0];
+    f3[1] = -sy2 - f4[1];
+    f3[2] = -sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
--- a/src/USER-OMP/dihedral_fourier_omp.h
+++ b/src/USER-OMP/dihedral_fourier_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(fourier/omp,DihedralFourierOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_FOURIER_OMP_H
+#define LMP_DIHEDRAL_FOURIER_OMP_H
+
+#include "dihedral_fourier.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded flavor of the fourier dihedral style; combines the
+// DihedralFourier base class with the ThrOMP threading support class.
+class DihedralFourierOMP : public DihedralFourier, public ThrOMP {
+
+ public:
+  DihedralFourierOMP(class LAMMPS *);
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // per-thread worker; the three integer template flags are
+  // compile-time constants chosen by the caller
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int nfrom, int nto, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/dihedral_nharmonic_omp.cpp
+++ b/src/USER-OMP/dihedral_nharmonic_omp.cpp
@ -0,0 +1,273 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_nharmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the threaded nharmonic dihedral style: initialize both base
+// classes and add the OMP bit to the suffix flag.
+DihedralNHarmonicOMP::DihedralNHarmonicOMP(class LAMMPS *lmp) :
+  DihedralNHarmonic(lmp),
+  ThrOMP(lmp, THR_DIHEDRAL)
+{
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Threaded driver: sets up energy/virial accumulation, then lets every
+// thread run the eval<>() specialization matching the current flags on
+// its own range of the dihedral list.
+void DihedralNHarmonicOMP::compute(int eflag, int vflag)
+{
+  // energy/virial bookkeeping is only initialized when requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int tid, first, last;
+
+    // obtain this thread's id and its loop bounds, then its data object
+    loop_setup_thr(first, last, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // template arguments are <EVFLAG,EFLAG,NEWTON_BOND>
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(first, last, thr);
+      else eval<0,0,0>(first, last, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(first, last, thr);
+      else eval<1,1,0>(first, last, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(first, last, thr);
+      else eval<1,0,0>(first, last, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   Per-thread force/energy kernel of the nharmonic dihedral style.
+   Processes entries nfrom <= n < nto of neighbor->dihedrallist and
+   accumulates forces into the force array obtained from thr.
+   Template flags (compile-time constants):
+     EVFLAG      - if nonzero, call ev_tally_thr() for every dihedral
+     EFLAG       - if nonzero, store the dihedral energy for the tally
+     NEWTON_BOND - if nonzero, apply forces to all four atoms; otherwise
+                   only to atoms whose index is < nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralNHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s12,c,cpow,p,pd,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+  double s2,sin2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+    // the sines are bounded from below by SMALL before inversion
+
+    sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    // error check
+    // only warn when c is outside [-1,1] by more than TOLERANCE
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // p = sum (i=1,n) a_i * c**(i-1)
+    // pd = dp/dc
+    // cpow holds the running power of c, so that c itself stays
+    // untouched: it is needed again for the force terms below.
+    // NOTE(review): a[type][1] is read unconditionally, which
+    // presumably requires nterms[type] >= 2 - confirm against coeff().
+    p = a[type][0];
+    pd = a[type][1];
+    cpow = c;
+    for (int i = 1; i < nterms[type]-1; i++) {
+      p += cpow * a[type][i];
+      pd += cpow * static_cast<double>(i+1) * a[type][i+1];
+      cpow *= c;
+    }
+    p += cpow * a[type][nterms[type]-1];
+
+    if (EFLAG) edihedral = p;
+
+    c = c * pd;
+    s12 = s12 * pd;
+    a11 = c*sb1*s1;
+    a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1*(c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2*(c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
--- a/src/USER-OMP/dihedral_nharmonic_omp.h
+++ b/src/USER-OMP/dihedral_nharmonic_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(nharmonic/omp,DihedralNHarmonicOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_NHARMONIC_OMP_H
+#define LMP_DIHEDRAL_NHARMONIC_OMP_H
+
+#include "dihedral_nharmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded flavor of the nharmonic dihedral style; combines the
+// DihedralNHarmonic base class with the ThrOMP threading support class.
+class DihedralNHarmonicOMP : public DihedralNHarmonic, public ThrOMP {
+
+ public:
+  DihedralNHarmonicOMP(class LAMMPS *);
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // per-thread worker over entries [nfrom,nto) of the dihedral list;
+  // the three integer template flags are compile-time constants
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int nfrom, int nto, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/dihedral_quadratic_omp.cpp
+++ b/src/USER-OMP/dihedral_quadratic_omp.cpp
@ -0,0 +1,286 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_quadratic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+#define SMALLER   0.00001
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the threaded quadratic dihedral style: initialize both base
+// classes and add the OMP bit to the suffix flag.
+DihedralQuadraticOMP::DihedralQuadraticOMP(class LAMMPS *lmp) :
+  DihedralQuadratic(lmp),
+  ThrOMP(lmp, THR_DIHEDRAL)
+{
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Threaded driver: sets up energy/virial accumulation, then lets every
+// thread run the eval<>() specialization matching the current flags on
+// its own range of the dihedral list.
+void DihedralQuadraticOMP::compute(int eflag, int vflag)
+{
+  // energy/virial bookkeeping is only initialized when requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int tid, first, last;
+
+    // obtain this thread's id and its loop bounds, then its data object
+    loop_setup_thr(first, last, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // template arguments are <EVFLAG,EFLAG,NEWTON_BOND>
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(first, last, thr);
+      else eval<0,0,0>(first, last, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(first, last, thr);
+      else eval<1,1,0>(first, last, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(first, last, thr);
+      else eval<1,0,0>(first, last, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   Per-thread force/energy kernel of the quadratic dihedral style.
+   Processes entries nfrom <= n < nto of neighbor->dihedrallist and
+   accumulates forces into the force array obtained from thr.
+   Template flags (compile-time constants):
+     EVFLAG      - if nonzero, call ev_tally_thr() for every dihedral
+     EFLAG       - if nonzero, store the dihedral energy for the tally
+     NEWTON_BOND - if nonzero, apply forces to all four atoms; otherwise
+                   only to atoms whose index is < nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralQuadraticOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s12,c,p,pd,a,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+  double s2,cx,cy,cz,cmag,dx,phi,si,sin2;
+
+  edihedral = 0.0;
+
+  // positions are read from the atom class, forces are written to the
+  // array handed out by the thread data object
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    // each list entry holds four atom indices followed by the type
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+    // the sines are bounded from below by SMALL before inversion
+
+    sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    // (cx,cy,cz) = vb1 x vb2; dx is its normalized projection onto vb3
+    // and is only used below to pick the sign of phi
+    // NOTE(review): cmag is not guarded against zero here - confirm that
+    // collinear bonds 1 and 2 cannot occur or are acceptable
+    cx = vb1y*vb2z - vb1z*vb2y;
+    cy = vb1z*vb2x - vb1x*vb2z;
+    cz = vb1x*vb2y - vb1y*vb2x;
+    cmag = sqrt(cx*cx + cy*cy + cz*cz);
+    dx = (cx*vb3x + cy*vb3y + cz*vb3z)/cmag/b3mag;
+
+    // error check
+    // only warn when c is outside [-1,1] by more than TOLERANCE
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    // clamp so that acos() below receives a valid argument
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // p = k ( phi- phi0)^2
+    // pd = dp/dc
+
+    phi = acos(c);
+    if (dx < 0.0) phi *= -1.0;
+    si = sin(phi);
+
+    // p temporarily holds k*dphi; it is multiplied by dphi once more
+    // after pd has been computed
+    double dphi = phi-phi0[type];
+    p = k[type]*dphi;
+    // avoid dividing by a (nearly) vanishing sin(phi)
+    if (fabs(si) < SMALLER) {
+        pd = - 2.0 * k[type];
+    } else {
+        pd = - 2.0 * p / si;
+    }
+    p = p * dphi;
+
+    if (EFLAG) edihedral = p; 
+
+    a = pd;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*sb1*s1;
+    a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    // energy/virial tally receives the forces on atoms 1, 3 and 4
+    // together with the three bond vectors
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
--- a/src/USER-OMP/dihedral_quadratic_omp.h
+++ b/src/USER-OMP/dihedral_quadratic_omp.h
@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(quadratic/omp,DihedralQuadraticOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_QUADRATIC_OMP_H
+#define LMP_DIHEDRAL_QUADRATIC_OMP_H
+
+#include "dihedral_quadratic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// OpenMP-threaded flavor of the quadratic dihedral style; combines the
+// DihedralQuadratic base class with the ThrOMP threading support class.
+class DihedralQuadraticOMP : public DihedralQuadratic, public ThrOMP {
+
+ public:
+  DihedralQuadraticOMP(class LAMMPS *);
+  virtual void compute(int eflag, int vflag);
+
+ private:
+  // per-thread worker over entries [nfrom,nto) of the dihedral list;
+  // the three integer template flags are compile-time constants
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int nfrom, int nto, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/dihedral_table_omp.cpp
+++ b/src/USER-OMP/dihedral_table_omp.cpp
@ -15,7 +15,10 @@
   Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */

-#include "math.h"
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+
 #include "dihedral_table_omp.h"
 #include "atom.h"
 #include "comm.h"
@ -25,12 +28,77 @@
 #include "update.h"
 #include "error.h"

+#include "math_const.h"
+#include "math_extra.h"
+
 #include "suffix.h"
+
 using namespace LAMMPS_NS;
-using namespace DIHEDRAL_TABLE_NS;
+using namespace MathConst;
+using namespace MathExtra;
+
 #define TOLERANCE 0.05
 #define SMALL     0.001

+// --------------------------------------------
+// ------- Calculate the dihedral angle -------
+// --------------------------------------------
+static const int g_dim=3;
+
+// Compute the dihedral angle defined by the four positions x1..x4.
+// The returned angle is the acos() of a clamped cosine, mapped to
+// 2*PI - angle when n123 has a positive projection onto vb34.
+// All output arrays must provide room for g_dim doubles and are
+// filled in by this function.
+static double Phi(double const *x1, // x,y,z coordinates of atom 1
+                  double const *x2, // x,y,z coordinates of atom 2
+                  double const *x3, // x,y,z coordinates of atom 3
+                  double const *x4, // x,y,z coordinates of atom 4
+                  Domain *domain,   // used for the minimum image convention
+                  double *vb12,     // out: x2-x1
+                  double *vb23,     // out: x3-x2
+                  double *vb34,     // out: x4-x3
+                  double *n123,     // out: normalized vb23 x vb12
+                  double *n234)     // out: normalized vb23 x vb34
+{
+  // the three bond vectors, one component at a time
+  for (int d=0; d < g_dim; ++d) {
+    vb12[d] = x2[d] - x1[d];
+    vb23[d] = x3[d] - x2[d];
+    vb34[d] = x4[d] - x3[d];
+  }
+
+  // apply periodic boundary conditions to each bond vector
+  domain->minimum_image(vb12[0],vb12[1],vb12[2]);
+  domain->minimum_image(vb23[0],vb23[1],vb23[2]);
+  domain->minimum_image(vb34[0],vb34[1],vb34[2]);
+
+  // normals of the planes through atoms 1,2,3 and through atoms 2,3,4
+  cross3(vb23, vb12, n123);
+  norm3(n123);
+  cross3(vb23, vb34, n234);
+  norm3(n234);
+
+  // keep the cosine inside the domain of acos()
+  const double raw_cos = -dot3(n123, n234);
+  const double cos_phi = (raw_cos > 1.0) ? 1.0
+    : ((raw_cos < -1.0) ? -1.0 : raw_cos);
+
+  const double phi = acos(cos_phi);
+
+  // negative angles are shifted by 2*PI so the result stays in 0 to 2*PI
+  if (dot3(n123, vb34) > 0.0) return MY_2PI - phi;
+  return phi;
+} // DihedralTable::Phi()
+
+
 /* ---------------------------------------------------------------------- */

 DihedralTableOMP::DihedralTableOMP(class LAMMPS *lmp)
@ -178,9 +246,9 @@ void DihedralTableOMP::eval(int nfrom, int nto, ThrData * const thr)
    double dphi_dx3[g_dim]; //               d x[i1][d]
    double dphi_dx4[g_dim]; //where d=0,1,2 corresponds to x,y,z  (if g_dim==3)

-    double dot123             = DotProduct(vb12, vb23);
-    double dot234             = DotProduct(vb23, vb34);
-    double L23sqr             = DotProduct(vb23, vb23);
+    double dot123             = dot3(vb12, vb23);
+    double dot234             = dot3(vb23, vb34);
+    double L23sqr             = dot3(vb23, vb23);
    double L23                = sqrt(L23sqr);   // (central bond length)
    double inv_L23sqr = 0.0;
    double inv_L23    = 0.0;
@ -200,15 +268,14 @@ void DihedralTableOMP::eval(int nfrom, int nto, ThrData * const thr)
      perp34on23[d] = vb34[d] - proj34on23[d];
    }

-
    // --- Compute the gradient vectors dphi/dx1 and dphi/dx4: ---

    // These two gradients point in the direction of n123 and n234,
    // and are scaled by the distances of atoms 1 and 4 from the central axis.
    // Distance of atom 1 to central axis:
-    double perp12on23_len = sqrt(DotProduct(perp12on23, perp12on23));
+    double perp12on23_len = sqrt(dot3(perp12on23, perp12on23));
    // Distance of atom 4 to central axis:
-    double perp34on23_len = sqrt(DotProduct(perp34on23, perp34on23));
+    double perp34on23_len = sqrt(dot3(perp34on23, perp34on23));

    double inv_perp12on23 = 0.0;
    if (perp12on23_len != 0.0) inv_perp12on23 = 1.0 / perp12on23_len;
@ -265,33 +332,10 @@ void DihedralTableOMP::eval(int nfrom, int nto, ThrData * const thr)
    }


-
-
-    #ifdef DIH_DEBUG_NUM
-    // ----- Numerical test? -----
-
-    cerr << "  -- testing gradient for dihedral (n="<<n<<") for atoms ("
-         << i1 << "," << i2 << "," << i3 << "," << i4 << ") --" << endl;
-
-    PrintGradientComparison(*this, dphi_dx1, dphi_dx2, dphi_dx3, dphi_dx4,
-                            domain, x[i1], x[i2], x[i3], x[i4]);
-
-    for (int d=0; d < g_dim; ++d) {
-      // The sum of all the gradients should be near 0. (translational symmetry)
-      cerr <<"sum_gradients["<<d<<"]="<<dphi_dx1[d]<<"+"<<dphi_dx2[d]<<"+"<<dphi_dx3[d]<<"+"<<dphi_dx4[d]<<"="<<dphi_dx1[d]+dphi_dx2[d]+dphi_dx3[d]+dphi_dx4[d]<<endl;
-      // These should sum to zero
-      assert(abs(dphi_dx1[d]+dphi_dx2[d]+dphi_dx3[d]+dphi_dx4[d]) < 0.0002/L23);
-    }
-    #endif // #ifdef DIH_DEBUG_NUM
-
-
-
-
    // ----- Step 3: Calculate the energy and force in the phi direction -----

    // tabulated force & energy
    double u, m_du_dphi; //u = energy.   m_du_dphi = "minus" du/dphi
-    assert((0.0 <= phi) && (phi <= TWOPI));

    uf_lookup(type, phi, u, m_du_dphi);

--- a/src/USER-OMP/improper_fourier_omp.cpp
+++ b/src/USER-OMP/improper_fourier_omp.cpp
@ -0,0 +1,282 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_fourier_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the threaded fourier improper style: initialize both base
+// classes and add the OMP bit to the suffix flag.
+ImproperFourierOMP::ImproperFourierOMP(class LAMMPS *lmp) :
+  ImproperFourier(lmp),
+  ThrOMP(lmp, THR_IMPROPER)
+{
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Threaded driver: sets up energy/virial accumulation, then lets every
+// thread run the eval<>() specialization matching the current flags on
+// its own range of the improper list.
+void ImproperFourierOMP::compute(int eflag, int vflag)
+{
+  // energy/virial bookkeeping is only initialized when requested
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int tid, first, last;
+
+    // obtain this thread's id and its loop bounds, then its data object
+    loop_setup_thr(first, last, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // template arguments are <EVFLAG,EFLAG,NEWTON_BOND>
+    if (!evflag) {
+      if (force->newton_bond) eval<0,0,1>(first, last, thr);
+      else eval<0,0,0>(first, last, thr);
+    } else if (eflag) {
+      if (force->newton_bond) eval<1,1,1>(first, last, thr);
+      else eval<1,1,0>(first, last, thr);
+    } else {
+      if (force->newton_bond) eval<1,0,1>(first, last, thr);
+      else eval<1,0,0>(first, last, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+// Per-thread loop over entries nfrom <= n < nto of the improper list.
+// For every entry the three vectors from atom i1 to atoms i2, i3 and i4
+// are formed and handed to add1_thr(); when all[type] is set, the two
+// cyclic permutations of (i2,i3,i4) are processed as well.
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperFourierOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  const double * const * const x = atom->x;
+  const int * const * const improperlist = neighbor->improperlist;
+
+  for (int n = nfrom; n < nto; ++n) {
+    const int * const entry = improperlist[n];
+    const int i1 = entry[0];
+    const int i2 = entry[1];
+    const int i3 = entry[2];
+    const int i4 = entry[3];
+    const int type = entry[4];
+
+    // all three vectors start at atom i1
+    const double * const x1 = x[i1];
+
+    // 1st bond
+    const double vb1x = x[i2][0] - x1[0];
+    const double vb1y = x[i2][1] - x1[1];
+    const double vb1z = x[i2][2] - x1[2];
+
+    // 2nd bond
+    const double vb2x = x[i3][0] - x1[0];
+    const double vb2y = x[i3][1] - x1[1];
+    const double vb2z = x[i3][2] - x1[2];
+
+    // 3rd bond
+    const double vb3x = x[i4][0] - x1[0];
+    const double vb3y = x[i4][1] - x1[1];
+    const double vb3z = x[i4][2] - x1[2];
+
+    add1_thr<EVFLAG,EFLAG,NEWTON_BOND>(i1,i2,i3,i4,type,
+                                       vb1x,vb1y,vb1z,
+                                       vb2x,vb2y,vb2z,
+                                       vb3x,vb3y,vb3z,thr);
+
+    if (!all[type]) continue;
+
+    // remaining two cyclic permutations of the outer atoms
+    add1_thr<EVFLAG,EFLAG,NEWTON_BOND>(i1,i4,i2,i3,type,
+                                       vb3x,vb3y,vb3z,
+                                       vb1x,vb1y,vb1z,
+                                       vb2x,vb2y,vb2z,thr);
+    add1_thr<EVFLAG,EFLAG,NEWTON_BOND>(i1,i3,i4,i2,type,
+                                       vb2x,vb2y,vb2z,
+                                       vb3x,vb3y,vb3z,
+                                       vb1x,vb1y,vb1z,thr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Compute one fourier improper contribution for the atoms i1..i4 and
+   accumulate the forces into the force array obtained from thr.
+   vb1, vb2 and vb3 are the three vectors supplied by the caller; the
+   angle is measured between A = vb1 x vb2 and vb3.
+   Template flags (compile-time constants):
+     EVFLAG      - if nonzero, call ev_tally_thr() for this term
+     EFLAG       - if nonzero, store the improper energy for the tally
+     NEWTON_BOND - if nonzero, apply forces to all four atoms; otherwise
+                   only to atoms whose index is < nlocal
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperFourierOMP::add1_thr(const int i1,const int i2,
+                               const int i3,const int i4,
+                               const int type,
+                               const double &vb1x,
+                               const double &vb1y,
+                               const double &vb1z,
+                               const double &vb2x,
+                               const double &vb2y,
+                               const double &vb2z,
+                               const double &vb3x,
+                               const double &vb3y,
+                               const double &vb3z,
+                               ThrData * const thr)
+{
+  double eimproper,f1[3],f2[3],f3[3],f4[3];
+  double c,c2,a,s,projhfg,dhax,dhay,dhaz,dahx,dahy,dahz,cotphi;
+  double ax,ay,az,ra2,rh2,ra,rh,rar,rhr,arx,ary,arz,hrx,hry,hrz;
+
+  double * const * const f = thr->get_f();
+  const int nlocal = atom->nlocal;
+
+  eimproper = 0.0;
+
+  // c0 calculation
+  // A = vb1 X vb2 is perpendicular to IJK plane
+
+  ax = vb1y*vb2z-vb1z*vb2y;
+  ay = vb1z*vb2x-vb1x*vb2z;
+  az = vb1x*vb2y-vb1y*vb2x;
+  ra2 = ax*ax+ay*ay+az*az;
+  rh2 = vb3x*vb3x+vb3y*vb3y+vb3z*vb3z;
+  ra = sqrt(ra2);
+  rh = sqrt(rh2);
+  // lengths are bounded from below by SMALL before inversion
+  if (ra < SMALL) ra = SMALL;
+  if (rh < SMALL) rh = SMALL;
+
+  rar = 1/ra;
+  rhr = 1/rh;
+  arx = ax*rar;
+  ary = ay*rar;
+  arz = az*rar;
+  hrx = vb3x*rhr;
+  hry = vb3y*rhr;
+  hrz = vb3z*rhr;
+
+  c = arx*hrx+ary*hry+arz*hrz;
+
+  // error check
+  // only warn when c is outside [-1,1] by more than TOLERANCE
+
+  if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+    int me = comm->me;
+
+    if (screen) {
+      char str[128];
+      sprintf(str,
+              "Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+              me,thr->get_tid(),update->ntimestep,
+              atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+      error->warning(FLERR,str,0);
+      fprintf(screen,"  1st atom: %d %g %g %g\n",
+              me,atom->x[i1][0],atom->x[i1][1],atom->x[i1][2]);
+      fprintf(screen,"  2nd atom: %d %g %g %g\n",
+              me,atom->x[i2][0],atom->x[i2][1],atom->x[i2][2]);
+      fprintf(screen,"  3rd atom: %d %g %g %g\n",
+              me,atom->x[i3][0],atom->x[i3][1],atom->x[i3][2]);
+      fprintf(screen,"  4th atom: %d %g %g %g\n",
+              me,atom->x[i4][0],atom->x[i4][1],atom->x[i4][2]);
+    }
+  }
+
+  // clamp the cosine itself, so that 1.0 - c*c below cannot become
+  // negative and turn the square root into NaN
+  if (c > 1.0) c = 1.0;
+  if (c < -1.0) c = -1.0;
+
+  s = sqrt(1.0 - c*c);
+  if (s < SMALL) s = SMALL;
+  cotphi = c/s;
+
+  // sum of the projections of vb3 onto the directions of vb1 and vb2;
+  // a positive value flips the sign of s and cotphi
+  projhfg = (vb3x*vb1x+vb3y*vb1y+vb3z*vb1z) /
+    sqrt(vb1x*vb1x+vb1y*vb1y+vb1z*vb1z);
+  projhfg += (vb3x*vb2x+vb3y*vb2y+vb3z*vb2z) /
+    sqrt(vb2x*vb2x+vb2y*vb2y+vb2z*vb2z);
+  if (projhfg > 0.0) {
+    s *= -1.0;
+    cotphi *= -1.0;
+  }
+
+  //  force and energy
+  //  E = k ( C0 + C1 cos(w) + C2 cos(2w) )
+  //  with cos(w) = s and cos(2w) = 2 s^2 - 1
+
+  c2 = 2.0*s*s-1.0;
+  if (EFLAG) eimproper =  k[type]*(C0[type]+C1[type]*s+C2[type]*c2);
+
+  // dhax = difference between H and A in X direction, etc
+
+  a = k[type]*(C1[type]+4.0*C2[type]*s)*cotphi;
+  dhax = hrx-c*arx;
+  dhay = hry-c*ary;
+  dhaz = hrz-c*arz;
+
+  dahx = arx-c*hrx;
+  dahy = ary-c*hry;
+  dahz = arz-c*hrz;
+
+  f2[0] = (dhay*vb1z - dhaz*vb1y)*rar;
+  f2[1] = (dhaz*vb1x - dhax*vb1z)*rar;
+  f2[2] = (dhax*vb1y - dhay*vb1x)*rar;
+
+  f3[0] = (-dhay*vb2z + dhaz*vb2y)*rar;
+  f3[1] = (-dhaz*vb2x + dhax*vb2z)*rar;
+  f3[2] = (-dhax*vb2y + dhay*vb2x)*rar;
+
+  f4[0] = dahx*rhr;
+  f4[1] = dahy*rhr;
+  f4[2] = dahz*rhr;
+
+  f1[0] = -(f2[0] + f3[0] + f4[0]);
+  f1[1] = -(f2[1] + f3[1] + f4[1]);
+  f1[2] = -(f2[2] + f3[2] + f4[2]);
+
+  // apply force to each of 4 atoms
+  // note that atom i2 receives f3 and atom i3 receives f2
+
+  if (NEWTON_BOND || i1 < nlocal) {
+    f[i1][0] += f1[0]*a;
+    f[i1][1] += f1[1]*a;
+    f[i1][2] += f1[2]*a;
+  }
+
+  if (NEWTON_BOND || i2 < nlocal) {
+    f[i2][0] += f3[0]*a;
+    f[i2][1] += f3[1]*a;
+    f[i2][2] += f3[2]*a;
+  }
+
+  if (NEWTON_BOND || i3 < nlocal) {
+    f[i3][0] += f2[0]*a;
+    f[i3][1] += f2[1]*a;
+    f[i3][2] += f2[2]*a;
+  }
+
+  if (NEWTON_BOND || i4 < nlocal) {
+    f[i4][0] += f4[0]*a;
+    f[i4][1] += f4[1]*a;
+    f[i4][2] += f4[2]*a;
+  }
+
+  // NOTE(review): the forces applied above are scaled by a, whereas the
+  // arrays passed to the tally below are not - confirm that the virial
+  // tally is meant to receive the unscaled values.
+  if (EVFLAG)
+    ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                 vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+}
--- a/src/USER-OMP/improper_fourier_omp.h
+++ b/src/USER-OMP/improper_fourier_omp.h
@ -0,0 +1,53 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(fourier/omp,ImproperFourierOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_FOURIER_OMP_H
+#define LMP_IMPROPER_FOURIER_OMP_H
+
+#include "improper_fourier.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperFourierOMP : public ImproperFourier, public ThrOMP {
+  // registered above as improper style fourier/omp; derives from ImproperFourier and ThrOMP
+ public:
+  ImproperFourierOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:  // both helpers are templated on the int flags EVFLAG, EFLAG and NEWTON_BOND
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+  // add1_thr takes five ints, nine const double references and a ThrData pointer
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void add1_thr(const int,const int,const int,const int,const int,
+		const double &, const double &, const double &,
+		const double &, const double &, const double &,
+		const double &, const double &, const double &,
+		ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/improper_ring_omp.cpp
+++ b/src/USER-OMP/improper_ring_omp.cpp
@ -24,9 +24,11 @@
 #include "force.h"
 #include "update.h"
 #include "error.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
+using namespace MathSpecial;

 #define TOLERANCE 0.05
 #define SMALL     0.001
@ -167,7 +169,7 @@ void ImproperRingOMP::eval(int nfrom, int nto, ThrData * const thr)
      /* Append the current angle to the sum of angle differences. */
      angle_summer += (bend_angle[icomb] - chi[type]);
    }
-    if (EFLAG) eimproper = (1.0/6.0) *k[type] * pow(angle_summer,6.0);
+    if (EFLAG) eimproper = (1.0/6.0) *k[type] * powint(angle_summer,6);
    /*
      printf("The tags: %d-%d-%d-%d, of type %d .\n",atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4],type);
      // printf("The coordinates of the first: %f, %f, %f.\n", x[i1][0], x[i1][1], x[i1][2]);
@ -181,7 +183,7 @@ void ImproperRingOMP::eval(int nfrom, int nto, ThrData * const thr)

    /* Force calculation acting on all atoms.
       Calculate the derivatives of the potential. */
-    angfac = k[type] * pow(angle_summer,5.0);
+    angfac = k[type] * powint(angle_summer,5);

    f1[0] = 0.0; f1[1] = 0.0; f1[2] = 0.0;
    f3[0] = 0.0; f3[1] = 0.0; f3[2] = 0.0;
--- a/src/USER-OMP/msm_cg_omp.cpp
+++ b/src/USER-OMP/msm_cg_omp.cpp
@ -0,0 +1,522 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+   Original MSM class by: Paul Crozier, Stan Moore, Stephen Bond, (all SNL)
+------------------------------------------------------------------------- */
+
+#include "lmptype.h"
+#include "mpi.h"
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+
+#include "atom.h"
+#include "commgrid.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "msm_cg_omp.h"
+
+#include "math_const.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define OFFSET 16384     // added before and removed after the int cast in particle_map()
+#define SMALLQ 0.00001   // default for smallq when the constructor gets no second argument
+// flags passed to cg[..]->reverse_comm() and cg[..]->forward_comm() in compute()
+enum{REVERSE_RHO,REVERSE_AD,REVERSE_AD_PERATOM};
+enum{FORWARD_RHO,FORWARD_AD,FORWARD_AD_PERATOM};
+
+/* ---------------------------------------------------------------------- */
+
+MSMCGOMP::MSMCGOMP(LAMMPS *lmp, int narg, char **arg) : MSMOMP(lmp, narg, arg)
+{
+  // one or two arguments are accepted; an optional second one sets smallq
+
+  if (narg != 1 && narg != 2)
+    error->all(FLERR,"Illegal kspace_style msm/cg/omp command");
+
+  smallq = (narg == 2) ? atof(arg[1]) : SMALLQ;
+
+  // a negative count tells compute() that its one-time report is still due
+  is_charged = NULL;
+  num_charged = -1;
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+MSMCGOMP::~MSMCGOMP()
+{
+  memory->destroy(this->is_charged);  // release the list of charged-atom indices
+}
+
+
+/* ----------------------------------------------------------------------
+   compute the MSM long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void MSMCGOMP::compute(int eflag, int vflag)
+{
+  const double * const q = atom->q;
+  const int nlocal = atom->nlocal;
+  int i,j,n;
+  // MSM solve whose per-atom steps only visit the atoms listed in is_charged
+  // set energy/virial flags from eflag and vflag (all cleared when both are zero)
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+    eflag_atom = vflag_atom = eflag_either = vflag_either = 0;
+
+  if (vflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    for (n=0; n<levels; n++) {
+      cg_peratom[n]->ghost_notify();
+      cg_peratom[n]->setup();
+    }
+    peratom_allocate_flag = 1;
+  }
+
+  // grow part2grid and is_charged to atom->nmax entries once nlocal exceeds nmax
+
+  if (nlocal > nmax) {
+    memory->destroy(part2grid);
+    memory->destroy(is_charged);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"msm:part2grid");
+    memory->create(is_charged,nmax,"msm/cg:is_charged");
+  }
+
+  // one time setup message: num_charged is negative only before the first call
+
+  if (num_charged < 0) {
+    bigint charged_all = 0, charged_num;
+    double charged_frac, charged_fmax, charged_fmin;
+    // charged_all starts at 0: MPI_Reduce fills it on rank 0 only, yet every rank reads it
+    num_charged=0;
+    for (i=0; i < nlocal; ++i)
+      if (fabs(q[i]) > smallq)
+        ++num_charged;
+
+    // get fraction of charged particles per domain
+
+    if (nlocal > 0)
+      charged_frac = static_cast<double>(num_charged) * 100.0
+                   / static_cast<double>(nlocal);
+    else
+      charged_frac = 0.0;
+
+    MPI_Reduce(&charged_frac,&charged_fmax,1,MPI_DOUBLE,MPI_MAX,0,world);
+    MPI_Reduce(&charged_frac,&charged_fmin,1,MPI_DOUBLE,MPI_MIN,0,world);
+
+    // get fraction of charged particles overall (only printed on rank 0)
+
+    charged_num = num_charged;
+    MPI_Reduce(&charged_num,&charged_all,1,MPI_LMP_BIGINT,MPI_SUM,0,world);
+    charged_frac = static_cast<double>(charged_all) * 100.0
+                   / static_cast<double>(atom->natoms);
+
+    if (me == 0) {
+      if (screen)
+        fprintf(screen,
+                "  MSM/cg optimization cutoff: %g\n"
+                "  Total charged atoms: %.1f%%\n"
+                "  Min/max charged atoms/proc: %.1f%% %.1f%%\n",
+                smallq,charged_frac,charged_fmin,charged_fmax);
+      if (logfile)
+        fprintf(logfile,
+                "  MSM/cg optimization cutoff: %g\n"
+                "  Total charged atoms: %.1f%%\n"
+                "  Min/max charged atoms/proc: %.1f%% %.1f%%\n",
+                smallq,charged_frac,charged_fmin,charged_fmax);
+    }
+  }
+  // rebuild the index list of local atoms with |q| > smallq on every call
+  num_charged = 0;
+  for (i = 0; i < nlocal; ++i)
+    if (fabs(q[i]) > smallq) {
+      is_charged[num_charged] = i;
+      ++num_charged;
+    }
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d density grid (aninterpolation)
+
+  particle_map();
+  make_rho();
+
+  current_level = 0;
+  cg[0]->reverse_comm(this,REVERSE_RHO);
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+
+  for (n=0; n<=levels-2; n++) {
+    current_level = n;
+    cg[n]->forward_comm(this,FORWARD_RHO);
+
+    direct(n);
+    restriction(n);
+  }
+
+  // top grid level
+
+  current_level = levels-1;
+  cg[levels-1]->forward_comm(this,FORWARD_RHO);
+  direct_top(levels-1);
+
+  for (n=levels-2; n>=0; n--) {
+
+    prolongation(n);
+
+    current_level = n;
+    cg[n]->reverse_comm(this,REVERSE_AD);
+
+    // extra per-atom virial communication
+
+    if (vflag_atom)
+      cg_peratom[n]->reverse_comm(this,REVERSE_AD_PERATOM);
+  }
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  current_level = 0;
+
+  cg[0]->forward_comm(this,FORWARD_AD);
+
+  // extra per-atom energy/virial communication
+
+  if (vflag_atom)
+    cg_peratom[0]->forward_comm(this,FORWARD_AD_PERATOM);
+
+  // calculate the force on my particles (interpolation)
+
+  fieldforce();
+
+  // calculate the per-atom energy for my particles
+
+  if (evflag_atom) fieldforce_peratom();
+
+  const double qscale = force->qqrd2e * scale;
+
+  // Total long-range energy
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    double e_self = qsqsum*gamma(0.0)/cutoff;  // Self-energy term
+    energy -= e_self;
+    energy *= 0.5*qscale;
+  }
+
+  // Total long-range virial
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*virial_all[i];
+  }
+
+  // per-atom energy/virial, touched only for the atoms listed in is_charged
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    const double qs = 0.5*qscale;
+
+    if (eflag_atom) {
+      const double sf = gamma(0.0)/cutoff;
+      for (j = 0; j < num_charged; j++) {
+        i = is_charged[j];
+        eatom[i] -= q[i]*q[i]*sf;
+        eatom[i] *= qs;
+      }
+    }
+
+    if (vflag_atom) {
+      for (n = 0; n < num_charged; n++) {
+        i = is_charged[n];
+        for (j = 0; j < 6; j++)
+          vatom[i][j] *= qs;
+      }
+    }
+  }
+  // every thread fetches its own ThrData from fix and passes it to reduce_thr()
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array
+------------------------------------------------------------------------- */
+
+void MSMCGOMP::particle_map()
+{
+  const double * const * const pos = atom->x;
+  bool out_of_range = false;
+
+  // plain serial pass over the atoms listed in is_charged
+
+  for (int ic = 0; ic < num_charged; ++ic) {
+    const int ia = is_charged[ic];
+
+    // (gx,gy,gz) = global coords of grid pt to "lower left" of the charge
+    // the coordinate may lie outside both the global and the local box
+    // adding OFFSET before the int cast maps e.g. -0.75 to -1 instead of 0
+
+    const int gx = static_cast<int>((pos[ia][0]-boxlo[0])*delxinv[0]+OFFSET)-OFFSET;
+    const int gy = static_cast<int>((pos[ia][1]-boxlo[1])*delyinv[0]+OFFSET)-OFFSET;
+    const int gz = static_cast<int>((pos[ia][2]-boxlo[2])*delzinv[0]+OFFSET)-OFFSET;
+
+    part2grid[ia][0] = gx;
+    part2grid[ia][1] = gy;
+    part2grid[ia][2] = gz;
+
+    // the atom is fine only if its whole stencil fits into my 3d brick
+
+    const bool inside =
+      gx+nlower >= nxlo_out[0] && gx+nupper <= nxhi_out[0] &&
+      gy+nlower >= nylo_out[0] && gy+nupper <= nyhi_out[0] &&
+      gz+nlower >= nzlo_out[0] && gz+nupper <= nzhi_out[0];
+    if (!inside) out_of_range = true;
+  }
+
+  if (out_of_range) error->one(FLERR,"Out of range atoms - cannot compute MSM");
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void MSMCGOMP::make_rho()
+{
+  const double * const charge = atom->q;
+  const double * const * const pos = atom->x;
+  double * const * const * const rho = qgrid[0];
+
+  // clear ngrid[0] doubles of the level-0 density array, beginning at
+  //   its [nzlo_out[0]][nylo_out[0]][nxlo_out[0]] element
+
+  memset(&(rho[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double));
+
+  // loop over the listed charges, add their contribution to nearby grid points
+  // (cx,cy,cz) = grid pt stored for the charge in part2grid
+  // (dx,dy,dz) = distance to that grid pt
+  // (gz,gy,kx+cx) = global coords of moving stencil pt
+
+  for (int ic = 0; ic < num_charged; ++ic) {
+    const int ia = is_charged[ic];
+
+    const int cx = part2grid[ia][0];
+    const int cy = part2grid[ia][1];
+    const int cz = part2grid[ia][2];
+
+    double dx = cx - (pos[ia][0]-boxlo[0])*delxinv[0];
+    double dy = cy - (pos[ia][1]-boxlo[1])*delyinv[0];
+    double dz = cz - (pos[ia][2]-boxlo[2])*delzinv[0];
+
+    compute_phis_and_dphis(dx,dy,dz);
+
+    // the weight is q times the 1d phi values, multiplied in z, y, x order
+
+    const double qi = charge[ia];
+
+    for (int kz = nlower; kz <= nupper; ++kz) {
+      const int gz = kz+cz;
+      const double wz = qi*phi1d[2][kz];
+      for (int ky = nlower; ky <= nupper; ++ky) {
+        const int gy = ky+cy;
+        const double wzy = wz*phi1d[1][ky];
+        double * const row = rho[gz][gy];
+        for (int kx = nlower; kx <= nupper; ++kx)
+          row[kx+cx] += wzy*phi1d[0][kx];
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get force on my particles
+------------------------------------------------------------------------- */
+
+void MSMCGOMP::fieldforce()
+{
+  const double * const * const * const grid0 = egrid[0];
+  const double * const * const pos = atom->x;
+  const double * const charge = atom->q;
+  double * const * const frc = atom->f;
+
+  // loop over the listed charges, interpolate from nearby grid points
+  // (cx,cy,cz) = grid pt stored for the charge in part2grid
+  // (dx,dy,dz) = distance to that grid pt
+  // (kx,ky,kz) = stencil offsets between nlower and nupper
+  // (ex,ey,ez) = 3 components of E-field on particle
+
+  for (int ic = 0; ic < num_charged; ++ic) {
+    const int ia = is_charged[ic];
+
+    const int cx = part2grid[ia][0];
+    const int cy = part2grid[ia][1];
+    const int cz = part2grid[ia][2];
+
+    double dx = cx - (pos[ia][0]-boxlo[0])*delxinv[0];
+    double dy = cy - (pos[ia][1]-boxlo[1])*delyinv[0];
+    double dz = cz - (pos[ia][2]-boxlo[2])*delzinv[0];
+
+    compute_phis_and_dphis(dx,dy,dz);
+
+    double ex = 0.0;
+    double ey = 0.0;
+    double ez = 0.0;
+
+    // each component uses dphi1d along its own axis and phi1d along the others
+
+    for (int kz = nlower; kz <= nupper; ++kz) {
+      const double pz = phi1d[2][kz];
+      const double dpz = dphi1d[2][kz];
+      for (int ky = nlower; ky <= nupper; ++ky) {
+        const double py = phi1d[1][ky];
+        const double dpy = dphi1d[1][ky];
+        const double * const row = grid0[kz+cz][ky+cy];
+        for (int kx = nlower; kx <= nupper; ++kx) {
+          const double px = phi1d[0][kx];
+          const double dpx = dphi1d[0][kx];
+          const double val = row[kx+cx];
+          ex += dpx*py*pz*val;
+          ey += px*dpy*pz*val;
+          ez += px*py*dpz*val;
+        }
+      }
+    }
+
+    // scale the sums by delxinv[0], delyinv[0] and delzinv[0]
+
+    ex *= delxinv[0];
+    ey *= delyinv[0];
+    ez *= delzinv[0];
+
+    // convert E-field to force
+
+    const double qfactor = force->qqrd2e*scale*charge[ia];
+    frc[ia][0] += qfactor*ex;
+    frc[ia][1] += qfactor*ey;
+    frc[ia][2] += qfactor*ez;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void MSMCGOMP::fieldforce_peratom()
+{
+  const double * const charge = atom->q;
+  const double * const * const pos = atom->x;
+
+  // level-0 grids: egrid feeds eatom, v0grid..v5grid feed vatom[][0..5]
+
+  double ***egrid0 = egrid[0];
+  double ***vgrid0[6];
+  vgrid0[0] = v0grid[0];
+  vgrid0[1] = v1grid[0];
+  vgrid0[2] = v2grid[0];
+  vgrid0[3] = v3grid[0];
+  vgrid0[4] = v4grid[0];
+  vgrid0[5] = v5grid[0];
+
+  // loop over the listed charges, interpolate from nearby grid points
+  // (cx,cy,cz) = grid pt stored for the charge in part2grid
+  // (dx,dy,dz) = distance to that grid pt
+  // (gx,gy,gz) = global coords of moving stencil pt
+
+  for (int ic = 0; ic < num_charged; ++ic) {
+    const int ia = is_charged[ic];
+
+    const int cx = part2grid[ia][0];
+    const int cy = part2grid[ia][1];
+    const int cz = part2grid[ia][2];
+
+    double dx = cx - (pos[ia][0]-boxlo[0])*delxinv[0];
+    double dy = cy - (pos[ia][1]-boxlo[1])*delyinv[0];
+    double dz = cz - (pos[ia][2]-boxlo[2])*delzinv[0];
+
+    compute_phis_and_dphis(dx,dy,dz);
+
+    double usum = 0.0;
+    double vsum[6];
+    for (int k = 0; k < 6; ++k) vsum[k] = 0.0;
+
+    // only the quantities selected by eflag_atom / vflag_atom are summed
+
+    for (int kz = nlower; kz <= nupper; ++kz) {
+      const int gz = kz+cz;
+      const double wz = phi1d[2][kz];
+      for (int ky = nlower; ky <= nupper; ++ky) {
+        const int gy = ky+cy;
+        const double wzy = wz*phi1d[1][ky];
+        for (int kx = nlower; kx <= nupper; ++kx) {
+          const int gx = kx+cx;
+          const double w = wzy*phi1d[0][kx];
+          if (eflag_atom) usum += w*egrid0[gz][gy][gx];
+          if (vflag_atom) {
+            for (int k = 0; k < 6; ++k)
+              vsum[k] += w*vgrid0[k][gz][gy][gx];
+          }
+        }
+      }
+    }
+
+    // add the charge-weighted sums to the per-atom arrays
+
+    if (eflag_atom) eatom[ia] += charge[ia]*usum;
+    if (vflag_atom) {
+      for (int k = 0; k < 6; ++k)
+        vatom[ia][k] += charge[ia]*vsum[k];
+    }
+
+  }
+}
+
+
+double MSMCGOMP::memory_usage()
+{
+  // estimate from MSM::memory_usage() plus nmax ints for is_charged
+  const double base = MSM::memory_usage();
+  return base + nmax * sizeof(int);
+}
--- a/src/USER-OMP/msm_cg_omp.h
+++ b/src/USER-OMP/msm_cg_omp.h
@ -0,0 +1,141 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(msm/cg/omp,MSMCGOMP)
+
+#else
+
+#ifndef LMP_MSM_CG_OMP_H
+#define LMP_MSM_CG_OMP_H
+
+#include "msm_omp.h"
+
+namespace LAMMPS_NS {
+
+class MSMCGOMP : public MSMOMP {
+ public:
+  MSMCGOMP(class LAMMPS *, int, char **);
+  virtual ~MSMCGOMP();
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  int num_charged;      // entries used in is_charged; -1 until the first compute()
+  int *is_charged;      // local indices of the atoms with |q| > smallq
+  double smallq;        // charge cutoff: optional 2nd style argument, else SMALLQ
+
+ protected:  // overrides that loop over is_charged only
+  virtual void particle_map();
+  virtual void make_rho();
+  virtual void fieldforce();
+  virtual void fieldforce_peratom();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot (yet) use MSM with triclinic box
+
+This feature is not yet supported.
+
+E: Cannot (yet) use MSM with 2d simulation
+
+This feature is not yet supported.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use slab correction with MSM
+
+Slab correction can only be used with Ewald and PPPM, not MSM.
+
+E: MSM order must be 4, 6, 8, or 10
+
+This is a limitation of the MSM implementation in LAMMPS:
+the MSM order can only be 4, 6, 8, or 10.
+
+E: Cannot (yet) use single precision with MSM (remove -DFFT_SINGLE from Makefile and recompile)
+
+Single precision cannot be used with MSM.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with a long-range
+Coulombic component be selected that is compatible with MSM.  Note
+that TIP4P is not (yet) supported by MSM.
+
+E: Cannot use kspace solver on system with no charge
+
+No atoms in system have a non-zero charge.
+
+E: System is not charge neutral, net charge = %g
+
+The total charge on all atoms on the system is not 0.0, which
+is not valid for MSM.
+
+E: MSM grid is too large
+
+The global MSM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 16384.  You likely need to decrease the
+requested accuracy.
+
+W: MSM mesh too small, increasing to 2 points in each direction
+
+The global MSM grid is too small, so the number of grid points has been
+increased.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+W: Number of MSM mesh points increased to be a multiple of 2
+
+MSM requires that the number of grid points in each direction be a multiple
+of two and the number of grid points in one or more directions have been
+adjusted to meet this requirement.
+
+W: Adjusting Coulombic cutoff for MSM, new cutoff = %g
+
+The adjust/cutoff command is turned on and the Coulombic cutoff has been
+adjusted to match the user-specified accuracy.
+
+E: Out of range atoms - cannot compute MSM
+
+One or more atoms are attempting to map their charge to a MSM grid point
+that is not owned by a processor.  This is likely for one of two
+reasons, both of them bad.  First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors.  This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command.  The safest settings are "delay 0
+every 1 check yes".  Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+*/
--- a/src/USER-OMP/pair_airebo_omp.cpp
+++ b/src/USER-OMP/pair_airebo_omp.cpp
@ -19,6 +19,7 @@
 #include "error.h"
 #include "force.h"
 #include "memory.h"
+#include "math_special.h"
 #include "neighbor.h"
 #include "neigh_list.h"

@ -28,6 +29,7 @@

 #include "suffix.h"
 using namespace LAMMPS_NS;
+using namespace MathSpecial;

 #define TOL 1.0e-9

@ -2442,7 +2444,7 @@ void PairAIREBOOMP::TORSION_thr(int ifrom, int ito,
          cw2 = (.5*(1.0-cw));
          ekijl = epsilonT[ktype][ltype];
          Ec = 256.0*ekijl/405.0;
-          Vtors = (Ec*(pow(cw2,5.0)))-(ekijl/10.0);
+          Vtors = (Ec*(powint(cw2,5)))-(ekijl/10.0);

          if (eflag) evdwl = Vtors*w21*w23*w34*(1.0-tspjik)*(1.0-tspijl);

@ -2496,7 +2498,7 @@ void PairAIREBOOMP::TORSION_thr(int ifrom, int ito,
          ddndil = cross321mag*dxjdil;
          dcwddn = -cwnum/(cwnom*cwnom);
          dcwdn = 1.0/cwnom;
-          dvpdcw = (-1.0)*Ec*(-.5)*5.0*pow(cw2,4.0) *
+          dvpdcw = (-1.0)*Ec*(-.5)*5.0*powint(cw2,4) *
            w23*w21*w34*(1.0-tspjik)*(1.0-tspijl);

          Ftmp[0] = dvpdcw*((dcwdn*dndij[0])+(dcwddn*ddndij*del23[0]/r23));
--- a/src/USER-OMP/pair_beck_omp.cpp
+++ b/src/USER-OMP/pair_beck_omp.cpp
@ -19,9 +19,11 @@
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
+using namespace MathSpecial;

 /* ---------------------------------------------------------------------- */

@ -126,7 +128,7 @@ void PairBeckOMP::eval(int iifrom, int iito, ThrData * const thr)
        alphaij = alpha[itype][jtype];
        betaij = beta[itype][jtype];
        term1 = aaij*aaij + rsq;
-        term2 = 1.0/pow(term1,5.0);
+        term2 = powint(term1,-5);
        term3 = 21.672 + 30.0*aaij*aaij + 6.0*rsq;
        term4 = alphaij + r5*betaij;
        term5 = alphaij + 6.0*r5*betaij;
@ -146,7 +148,7 @@ void PairBeckOMP::eval(int iifrom, int iito, ThrData * const thr)
        }

        if (EFLAG) {
-          term6 = 1.0/pow(term1,3.0);
+          term6 = powint(term1,-3);
          term1inv = 1.0/term1;
          evdwl = AA[itype][jtype]*exp(-1.0*r*term4);
          evdwl -= BB[itype][jtype]*term6*(1.0+(2.709+3.0*aaij*aaij)*term1inv);
--- a/src/USER-OMP/pair_brownian_omp.cpp
+++ b/src/USER-OMP/pair_brownian_omp.cpp
@ -25,12 +25,14 @@
 #include "variable.h"
 #include "random_mars.h"
 #include "math_const.h"
+#include "math_special.h"

 #include "fix_wall.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

 #define EPSILON 1.0e-10

@ -105,11 +107,11 @@ void PairBrownianOMP::compute(int eflag, int vflag)
      double vol_f = vol_P/vol_T;
      if (flaglog == 0) {
        R0  = 6*MY_PI*mu*rad*(1.0 + 2.16*vol_f);
-        RT0 = 8*MY_PI*mu*pow(rad,3.0);
+        RT0 = 8*MY_PI*mu*cube(rad);
        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
      } else {
        R0  = 6*MY_PI*mu*rad*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
-        RT0 = 8*MY_PI*mu*pow(rad,3.0)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*cube(rad)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
      }
    }
@ -254,7 +256,7 @@ void PairBrownianOMP::eval(int iifrom, int iito, ThrData * const thr)
        if (FLAGLOG) {
          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep + 9.0/40.0*log(1.0/h_sep));
          a_sh = 6.0*MY_PI*mu*radi*(1.0/6.0*log(1.0/h_sep));
-          a_pu = 8.0*MY_PI*mu*pow(radi,3.0)*(3.0/160.0*log(1.0/h_sep));
+          a_pu = 8.0*MY_PI*mu*cube(radi)*(3.0/160.0*log(1.0/h_sep));
        } else
          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep);

--- a/src/USER-OMP/pair_brownian_poly_omp.cpp
+++ b/src/USER-OMP/pair_brownian_poly_omp.cpp
@ -27,10 +27,12 @@
 #include "fix_wall.h"

 #include "math_const.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

 #define EPSILON 1.0e-10

@ -105,11 +107,11 @@ void PairBrownianPolyOMP::compute(int eflag, int vflag)
      double vol_f = vol_P/vol_T;
      if (flaglog == 0) {
        R0  = 6*MY_PI*mu*rad*(1.0 + 2.16*vol_f);
-        RT0 = 8*MY_PI*mu*pow(rad,3.0);
+        RT0 = 8*MY_PI*mu*cube(rad);
        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
      } else {
        R0  = 6*MY_PI*mu*rad*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
-        RT0 = 8*MY_PI*mu*pow(rad,3.0)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*cube(rad)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
      }
    }
@ -250,20 +252,20 @@ void PairBrownianPolyOMP::eval(int iifrom, int iito, ThrData * const thr)

        if (FLAGLOG) {
          a_sq = beta0*beta0/beta1/beta1/h_sep +
-            (1.0+7.0*beta0+beta0*beta0)/5.0/pow(beta1,3.0)*log(1.0/h_sep);
-          a_sq += (1.0+18.0*beta0-29.0*beta0*beta0+18.0*pow(beta0,3.0) +
-                   pow(beta0,4.0))/21.0/pow(beta1,4.0)*h_sep*log(1.0/h_sep);
+            (1.0+7.0*beta0+beta0*beta0)/5.0/cube(beta1)*log(1.0/h_sep);
+          a_sq += (1.0+18.0*beta0-29.0*beta0*beta0+18.0*cube(beta0) +
+                   powint(beta0,4))/21.0/powint(beta1,4)*h_sep*log(1.0/h_sep);
          a_sq *= 6.0*MY_PI*mu*radi;
-          a_sh = 4.0*beta0*(2.0+beta0+2.0*beta0*beta0)/15.0/pow(beta1,3.0) *
+          a_sh = 4.0*beta0*(2.0+beta0+2.0*beta0*beta0)/15.0/cube(beta1) *
            log(1.0/h_sep);
-          a_sh += 4.0*(16.0-45.0*beta0+58.0*beta0*beta0-45.0*pow(beta0,3.0) +
-                       16.0*pow(beta0,4.0))/375.0/pow(beta1,4.0) *
+          a_sh += 4.0*(16.0-45.0*beta0+58.0*beta0*beta0-45.0*cube(beta0) +
+                       16.0*powint(beta0,4))/375.0/powint(beta1,4) *
            h_sep*log(1.0/h_sep);
          a_sh *= 6.0*MY_PI*mu*radi;
          a_pu = beta0*(4.0+beta0)/10.0/beta1/beta1*log(1.0/h_sep);
          a_pu += (32.0-33.0*beta0+83.0*beta0*beta0+43.0 *
-                   pow(beta0,3.0))/250.0/pow(beta1,3.0)*h_sep*log(1.0/h_sep);
-          a_pu *= 8.0*MY_PI*mu*pow(radi,3.0);
+                   cube(beta0))/250.0/cube(beta1)*h_sep*log(1.0/h_sep);
+          a_pu *= 8.0*MY_PI*mu*cube(radi);

        } else a_sq = 6.0*MY_PI*mu*radi*(beta0*beta0/beta1/beta1/h_sep);

--- a/src/USER-OMP/pair_colloid_omp.cpp
+++ b/src/USER-OMP/pair_colloid_omp.cpp
@ -20,9 +20,11 @@
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
+using namespace MathSpecial;

 /* ---------------------------------------------------------------------- */

@ -169,10 +171,10 @@ void PairColloidOMP::eval(int iifrom, int iito, ThrData * const thr)
        K[6] = K[2]-r;
        K[7] = 1.0/(K[3]*K[4]);
        K[8] = 1.0/(K[5]*K[6]);
-        g[0] = pow(K[3],-7.0);
-        g[1] = pow(K[4],-7.0);
-        g[2] = pow(K[5],-7.0);
-        g[3] = pow(K[6],-7.0);
+        g[0] = powint(K[3],-7);
+        g[1] = powint(K[4],-7);
+        g[2] = powint(K[5],-7);
+        g[3] = powint(K[6],-7);
        h[0] = ((K[3]+5.0*K[1])*K[3]+30.0*K[0])*g[0];
        h[1] = ((K[4]+5.0*K[1])*K[4]+30.0*K[0])*g[1];
        h[2] = ((K[5]+5.0*K[2])*K[5]-30.0*K[0])*g[2];
--- a/src/USER-OMP/pair_dipole_sf_omp.cpp
+++ b/src/USER-OMP/pair_dipole_sf_omp.cpp
@ -274,8 +274,9 @@ void PairDipoleSFOMP::eval(int iifrom, int iito, ThrData * const thr)

        if (EFLAG) {
          if (rsq < cut_coulsq[itype][jtype]) {
-            ecoul = qtmp * q[j] * rinv *
-              pow((1.0-sqrt(rsq)/sqrt(cut_coulsq[itype][jtype])),2.0);
+            ecoul = (1.0-sqrt(rsq)/sqrt(cut_coulsq[itype][jtype]));
+            ecoul *= ecoul;
+            ecoul *= qtmp * q[j] * rinv;
            if (mu[i][3] > 0.0 && mu[j][3] > 0.0)
              ecoul += bfac * (r3inv*pdotp - 3.0*r5inv*pidotr*pjdotr);
            if (mu[i][3] > 0.0 && q[j] != 0.0)
--- a/src/USER-OMP/pair_hbond_dreiding_lj_omp.cpp
+++ b/src/USER-OMP/pair_hbond_dreiding_lj_omp.cpp
@ -22,10 +22,12 @@
 #include "neigh_list.h"

 #include "math_const.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

 #define SMALL 0.001

@ -119,7 +121,6 @@ void PairHbondDreidingLJOMP::eval(int iifrom, int iito, ThrData * const thr)
  double r2inv,r10inv;
  double switch1,switch2;
  int *ilist,*jlist,*numneigh,**firstneigh;
-  Param *pm;

  evdwl = 0.0;

@ -178,9 +179,9 @@ void PairHbondDreidingLJOMP::eval(int iifrom, int iito, ThrData * const thr)
        ktype = type[k];
        m = type2param[itype][jtype][ktype];
        if (m < 0) continue;
-        pm = &params[m];
+        const Param &pm = params[m];

-        if (rsq < pm->cut_outersq) {
+        if (rsq < pm.cut_outersq) {
          delr1[0] = xtmp - x[k][0];
          delr1[1] = ytmp - x[k][1];
          delr1[2] = ztmp - x[k][2];
@ -203,7 +204,7 @@ void PairHbondDreidingLJOMP::eval(int iifrom, int iito, ThrData * const thr)
          if (c < -1.0) c = -1.0;
          ac = acos(c);

-          if (ac > pm->cut_angle && ac < (2.0*MY_PI - pm->cut_angle)) {
+          if (ac > pm.cut_angle && ac < (2.0*MY_PI - pm.cut_angle)) {
            s = sqrt(1.0 - c*c);
            if (s < SMALL) s = SMALL;

@ -211,24 +212,24 @@ void PairHbondDreidingLJOMP::eval(int iifrom, int iito, ThrData * const thr)

            r2inv = 1.0/rsq;
            r10inv = r2inv*r2inv*r2inv*r2inv*r2inv;
-            force_kernel = r10inv*(pm->lj1*r2inv - pm->lj2)*r2inv *
-              pow(c,(double)pm->ap);
-            force_angle = pm->ap * r10inv*(pm->lj3*r2inv - pm->lj4) *
-              pow(c,pm->ap-1.0)*s;
+            force_kernel = r10inv*(pm.lj1*r2inv - pm.lj2)*r2inv *
+              powint(c,pm.ap);
+            force_angle = pm.ap * r10inv*(pm.lj3*r2inv - pm.lj4) *
+              powint(c,pm.ap-1)*s;

-            eng_lj = r10inv*(pm->lj3*r2inv - pm->lj4);
-            if (rsq > pm->cut_innersq) {
-              switch1 = (pm->cut_outersq-rsq) * (pm->cut_outersq-rsq) *
-                        (pm->cut_outersq + 2.0*rsq - 3.0*pm->cut_innersq) /
-                        pm->denom_vdw;
-              switch2 = 12.0*rsq * (pm->cut_outersq-rsq) *
-                        (rsq-pm->cut_innersq) / pm->denom_vdw;
+            eng_lj = r10inv*(pm.lj3*r2inv - pm.lj4);
+            if (rsq > pm.cut_innersq) {
+              switch1 = (pm.cut_outersq-rsq) * (pm.cut_outersq-rsq) *
+                        (pm.cut_outersq + 2.0*rsq - 3.0*pm.cut_innersq) /
+                        pm.denom_vdw;
+              switch2 = 12.0*rsq * (pm.cut_outersq-rsq) *
+                        (rsq-pm.cut_innersq) / pm.denom_vdw;
              force_kernel = force_kernel*switch1 + eng_lj*switch2;
              eng_lj *= switch1;
            }

            if (EFLAG) {
-              evdwl = eng_lj * pow(c,(double)pm->ap);
+              evdwl = eng_lj * powint(c,pm.ap);
              evdwl *= factor_hb;
            }

--- a/src/USER-OMP/pair_hbond_dreiding_morse_omp.cpp
+++ b/src/USER-OMP/pair_hbond_dreiding_morse_omp.cpp
@ -22,10 +22,12 @@
 #include "neigh_list.h"

 #include "math_const.h"
+#include "math_special.h"

 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

 #define SMALL 0.001

@ -118,7 +120,6 @@ void PairHbondDreidingMorseOMP::eval(int iifrom, int iito, ThrData * const thr)
  double fi[3],fj[3],delr1[3],delr2[3];
  double r,dr,dexp,eng_morse,switch1,switch2;
  int *ilist,*jlist,*numneigh,**firstneigh;
-  Param *pm;

  evdwl = 0.0;

@ -177,9 +178,9 @@ void PairHbondDreidingMorseOMP::eval(int iifrom, int iito, ThrData * const thr)
        ktype = type[k];
        m = type2param[itype][jtype][ktype];
        if (m < 0) continue;
-        pm = &params[m];
+        const Param &pm = params[m];

-        if (rsq < pm->cut_outersq) {
+        if (rsq < pm.cut_outersq) {
          delr1[0] = xtmp - x[k][0];
          delr1[1] = ytmp - x[k][1];
          delr1[2] = ztmp - x[k][2];
@ -202,31 +203,31 @@ void PairHbondDreidingMorseOMP::eval(int iifrom, int iito, ThrData * const thr)
          if (c < -1.0) c = -1.0;
          ac = acos(c);

-          if (ac > pm->cut_angle && ac < (2.0*MY_PI - pm->cut_angle)) {
+          if (ac > pm.cut_angle && ac < (2.0*MY_PI - pm.cut_angle)) {
            s = sqrt(1.0 - c*c);
            if (s < SMALL) s = SMALL;

            // Morse-specific kernel

            r = sqrt(rsq);
-            dr = r - pm->r0;
-            dexp = exp(-pm->alpha * dr);
-            eng_morse = pm->d0 * (dexp*dexp - 2.0*dexp);
-            force_kernel = pm->morse1*(dexp*dexp - dexp)/r * pow(c,(double)pm->ap);
-            force_angle = pm->ap * eng_morse * pow(c,(double)pm->ap-1.0)*s;
+            dr = r - pm.r0;
+            dexp = exp(-pm.alpha * dr);
+            eng_morse = pm.d0 * (dexp*dexp - 2.0*dexp);
+            force_kernel = pm.morse1*(dexp*dexp - dexp)/r * powint(c,pm.ap);
+            force_angle = pm.ap * eng_morse * powint(c,pm.ap-1)*s;

-            if (rsq > pm->cut_innersq) {
-              switch1 = (pm->cut_outersq-rsq) * (pm->cut_outersq-rsq) *
-                        (pm->cut_outersq + 2.0*rsq - 3.0*pm->cut_innersq) /
-                        pm->denom_vdw;
-              switch2 = 12.0*rsq * (pm->cut_outersq-rsq) *
-                        (rsq-pm->cut_innersq) / pm->denom_vdw;
+            if (rsq > pm.cut_innersq) {
+              switch1 = (pm.cut_outersq-rsq) * (pm.cut_outersq-rsq) *
+                        (pm.cut_outersq + 2.0*rsq - 3.0*pm.cut_innersq) /
+                        pm.denom_vdw;
+              switch2 = 12.0*rsq * (pm.cut_outersq-rsq) *
+                        (rsq-pm.cut_innersq) / pm.denom_vdw;
              force_kernel = force_kernel*switch1 + eng_morse*switch2;
              eng_morse *= switch1;
            }

            if (EFLAG) {
-              evdwl = eng_morse * pow(c,(double)params[m].ap);
+              evdwl = eng_morse * powint(c,pm.ap);
              evdwl *= factor_hb;
            }

--- a/src/USER-OMP/pair_lj_sdk_coul_msm_omp.cpp
+++ b/src/USER-OMP/pair_lj_sdk_coul_msm_omp.cpp
@ -0,0 +1,229 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+   This style is a simplified re-implementation of the CG/CMM pair style
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_sdk_coul_msm_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "lj_sdk_common.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace LJSDKParms;
+/* ---------------------------------------------------------------------- */
+// constructor: initializes both bases (PairLJSDKCoulMSM, ThrOMP with THR_PAIR), then sets two flags
+PairLJSDKCoulMSMOMP::PairLJSDKCoulMSMOMP(LAMMPS *lmp) :
+  PairLJSDKCoulMSM(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;  // add the OMP bit to the inherited suffix flags
+  respa_enable = 0;            // NOTE(review): presumably turns off rRESPA support for this style - confirm in Pair
+}
+
+/* ---------------------------------------------------------------------- */
+// threaded driver: every thread runs one eval_msm_thr<> instantiation on its own range of list entries
+void PairLJSDKCoulMSMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;  // nothing to tally: clear both flags
+
+  const int nall = atom->nlocal + atom->nghost;  // local plus ghost atom count
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+    // NOTE(review): loop_setup_thr presumably fills in tid and this thread's [ifrom,ito) share of inum - confirm in ThrOMP
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);  // ThrData object belonging to thread tid
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+    // select the kernel instantiation; template arguments are <EVFLAG, EFLAG, NEWTON_PAIR>
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval_msm_thr<1,1,1>(ifrom, ito, thr);
+        else eval_msm_thr<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval_msm_thr<1,0,1>(ifrom, ito, thr);
+        else eval_msm_thr<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval_msm_thr<0,0,1>(ifrom, ito, thr);
+      else eval_msm_thr<0,0,0>(ifrom, ito, thr);
+    }
+    // still inside the parallel region: each thread passes its own ThrData to ThrOMP::reduce_thr
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+// pair kernel for list entries [iifrom,iito): LJ (12-4, 9-6 or 12-6) plus coulomb scaled by kspace gamma()/dgamma()
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJSDKCoulMSMOMP::eval_msm_thr(int iifrom, int iito, ThrData * const thr)
+{
+  // EVFLAG: call ev_tally_thr per pair; EFLAG: also evaluate energies; NEWTON_PAIR: update f[j] for every j
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();  // forces go into the array handed out by thr
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+  const int nlocal = atom->nlocal;
+
+  // loop over neighbors of my atoms
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const double qtmp = q[i];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    double fxtmp,fytmp,fztmp;
+    fxtmp=fytmp=fztmp=0.0;  // force on atom i is summed here and stored after the jj loop
+
+    const int * const jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      double forcecoul, forcelj, evdwl, ecoul, fgamma, egamma;
+      forcecoul = forcelj = evdwl = ecoul = 0.0;
+
+      const int sbindex = sbmask(jlist[jj]);  // special-bond index from the list entry; 0 means no scaling below
+      const int j = jlist[jj] & NEIGHMASK;    // atom index with the extra bits masked off
+
+      const double delx = xtmp - x[j][0];
+      const double dely = ytmp - x[j][1];
+      const double delz = ztmp - x[j][2];
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        const double r2inv = 1.0/rsq;
+        const int ljt = lj_type[itype][jtype];
+        // coulomb part
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {  // analytic form: no table, or rsq not above tabinnersq
+            const double r = sqrt(rsq);
+            const double prefactor = qqrd2e * qtmp*q[j]/r;
+            fgamma = 1.0 + (rsq/cut_coulsq)*force->kspace->dgamma(r/cut_coul);
+            forcecoul = prefactor * fgamma;
+            if (EFLAG) {
+              egamma = 1.0 - (r/cut_coul)*force->kspace->gamma(r/cut_coul);
+              ecoul = prefactor*egamma;
+            }
+            if (sbindex) {  // subtract the excluded fraction of the bare prefactor term
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          } else {  // tabulated form, bin index taken from the bit pattern of rsq stored as float
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            const int itable = (rsq_lookup.i & ncoulmask) >> ncoulshiftbits;
+            const double fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];  // linear interpolation weight
+            const double table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (EFLAG) ecoul = qtmp*q[j] * (etable[itable] + fraction*detable[itable]);
+            if (sbindex) {  // same correction as above, with the prefactor interpolated from ctable
+              const double table2 = ctable[itable] + fraction*dctable[itable];
+              const double prefactor = qtmp*q[j] * table2;
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          }
+        }
+        // Lennard-Jones part, functional form selected by ljt
+        if (rsq < cut_ljsq[itype][jtype]) {
+
+          if (ljt == LJ12_4) {  // forcelj = lj1/r^12 - lj2/r^4
+            const double r4inv=r2inv*r2inv;
+            forcelj = r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                             - lj2[itype][jtype]);
+
+            if (EFLAG)
+              evdwl = r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+
+          } else if (ljt == LJ9_6) {  // forcelj = lj1/r^9 - lj2/r^6
+            const double r3inv = r2inv*sqrt(r2inv);
+            const double r6inv = r3inv*r3inv;
+            forcelj = r6inv*(lj1[itype][jtype]*r3inv
+                             - lj2[itype][jtype]);
+            if (EFLAG)
+              evdwl = r6inv*(lj3[itype][jtype]*r3inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+
+          } else if (ljt == LJ12_6) {  // forcelj = lj1/r^12 - lj2/r^6
+            const double r6inv = r2inv*r2inv*r2inv;
+            forcelj = r6inv*(lj1[itype][jtype]*r6inv
+                             - lj2[itype][jtype]);
+            if (EFLAG)
+              evdwl = r6inv*(lj3[itype][jtype]*r6inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+
+          if (sbindex) {  // scale force and energy by the special_lj factor for this pair
+            const double factor_lj = special_lj[sbindex];
+            forcelj *= factor_lj;
+            if (EFLAG) evdwl *= factor_lj;
+          }
+
+        }
+        // combine both parts and scale by 1/r^2 so that fpair multiplies the distance vector
+        const double fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {  // atoms with j >= nlocal are only updated when NEWTON_PAIR is set
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// memory report: memory_usage_thr() plus the figure returned by the PairLJSDKCoulMSM base class
+double PairLJSDKCoulMSMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJSDKCoulMSM::memory_usage();
+
+  return bytes;
+}
--- a/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h
+++ b/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h
@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/sdk/coul/msm/omp,PairLJSDKCoulMSMOMP)
+PairStyle(cg/cmm/coul/msm/omp,PairLJSDKCoulMSMOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SDK_COUL_MSM_OMP_H
+#define LMP_PAIR_LJ_SDK_COUL_MSM_OMP_H
+
+#include "pair_lj_sdk_coul_msm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSDKCoulMSMOMP : public PairLJSDKCoulMSM, public ThrOMP {
+  // pair style combining PairLJSDKCoulMSM with the ThrOMP threading helper class
+ public:
+  PairLJSDKCoulMSMOMP(class LAMMPS *);
+
+  virtual void compute(int, int);  // arguments are (eflag, vflag), see pair_lj_sdk_coul_msm_omp.cpp
+  virtual double memory_usage();   // adds memory_usage_thr() to the base class value
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval_msm_thr(int ifrom, int ito, ThrData * const thr);  // kernel for list entries [ifrom,ito)
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/thr_omp.cpp
+++ b/src/USER-OMP/thr_omp.cpp
@ -178,6 +178,13 @@ void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
          thr->virial_fdotr_compute(x, nlocal, nghost, -1);
        else
          thr->virial_fdotr_compute(x, nlocal, nghost, nfirst);
+      } else {
+        if (style == fix->last_pair_hybrid) {
+          // pair_style hybrid will compute fdotr for us
+          // but we first need to reduce the forces
+          data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+          need_force_reduce = 0;
+        }
      }
    }